diff --git a/.github/actions/ccache-clear/action.yml b/.github/actions/ccache-clear/action.yml
new file mode 100644
index 00000000000..d38587efaf8
--- /dev/null
+++ b/.github/actions/ccache-clear/action.yml
@@ -0,0 +1,28 @@
+name: "ccache-clear"
+description: "Delete all GitHub Actions caches matching a key prefix"
+inputs:
+  key:
+    description: "Cache key prefix to match and delete"
+    required: true
+
+# The gh CLI authenticates through the GH_TOKEN environment variable, which the calling workflow must provide.
+runs:
+  using: "composite"
+  steps:
+    - name: Clear caches
+      shell: bash
+      env:
+        # Hand the prefix over via the environment rather than expanding the expression inside the script text.
+        CACHE_KEY_PREFIX: "ccache-${{ inputs.key }}"
+      run: |
+        # gh lists only 30 caches unless told otherwise; errors stay visible on stderr for diagnosis.
+        CACHES=$(gh cache list --key "$CACHE_KEY_PREFIX" --limit 100 --json id,key --jq '.[] | "\(.id) \(.key)"')
+        if [ -z "$CACHES" ]; then
+          echo "No caches found with key prefix: $CACHE_KEY_PREFIX"
+          exit 0
+        fi
+        while read -r id key; do
+          echo "Deleting cache: $id ($key)"
+          # A cache may vanish between listing and deletion; cleanup problems should not fail the build.
+          gh cache delete "$id" || echo "::warning::Could not delete cache $id ($key)"
+        done <<< "$CACHES"
diff --git a/.github/workflows/build-cuda-windows.yml b/.github/workflows/build-cuda-windows.yml
index 631ff4ed26b..e9e941421b6 100644
--- a/.github/workflows/build-cuda-windows.yml
+++ b/.github/workflows/build-cuda-windows.yml
@@ -13,6 +13,7 @@ concurrency:
   queue: max
 
 env:
+  GH_TOKEN: ${{ github.token }}
   GGML_NLOOP: 3
   GGML_N_THREADS: 1
   LLAMA_ARG_LOG_COLORS: 1
@@ -23,6 +24,9 @@ jobs:
   cuda:
     runs-on: windows-2022
 
+    permissions:
+      actions: write
+
     strategy:
       matrix:
         cuda: ['12.4', '13.3']
@@ -36,7 +40,6 @@ jobs:
         uses: ggml-org/ccache-action@v1.2.21
         with:
           key: release-windows-2022-x64-cuda-${{ matrix.cuda }}
-          append-timestamp: false # note: use this only with non-concurrent jobs!
 
       - name: Install Cuda Toolkit
         uses: ./.github/actions/windows-setup-cuda
@@ -67,9 +70,17 @@ jobs:
           cmake --build build --config Release -j %NINJA_JOBS% -t ggml
           cmake --build build --config Release
 
+      - name: ccache-clear
+        uses: ./.github/actions/ccache-clear
+        with:
+          key: release-windows-2022-x64-cuda-${{ matrix.cuda }}
+
   hip:
     runs-on: windows-2022
 
+    permissions:
+      actions: write
+
     env:
       # Make sure this is in sync with build-cache.yml
       HIPSDK_INSTALLER_VERSION: "26.Q1"
@@ -125,7 +136,6 @@ jobs:
           #       to populate the ccache for the release with manual runs of this workflow
           #key: release-windows-2022-x64-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}
           key: cuda-windows-2022-x64-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}
-          append-timestamp: false # note: use this only with non-concurrent jobs!
 
       - name: Build
         id: cmake_build
@@ -144,3 +154,9 @@ jobs:
             -DGPU_TARGETS="gfx1100"  `
             -DGGML_RPC=ON
           cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
+
+      - name: ccache-clear
+        uses: ./.github/actions/ccache-clear
+        with:
+          #key: release-windows-2022-x64-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}
+          key: cuda-windows-2022-x64-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}
diff --git a/.github/workflows/build-vulkan.yml b/.github/workflows/build-vulkan.yml
index e6eab8fd0aa..d473b14c11d 100644
--- a/.github/workflows/build-vulkan.yml
+++ b/.github/workflows/build-vulkan.yml
@@ -52,14 +52,6 @@ jobs:
         id: checkout
         uses: actions/checkout@v6
 
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: vulkan-${{ matrix.os }}
-          variant: ccache
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
       - name: Dependencies
         id: depends
         run: |
@@ -68,14 +60,20 @@ jobs:
           echo "CC=gcc-14" >> "$GITHUB_ENV"
           echo "CXX=g++-14" >> "$GITHUB_ENV"
 
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: vulkan-${{ matrix.os }}-new
+          variant: ccache
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
       - name: Configure
         id: cmake_configure
         run: |
           cmake -B build \
             -G "Ninja" \
-            -DCMAKE_BUILD_TYPE=RelWithDebInfo \
-            -DGGML_BACKEND_DL=ON \
-            -DGGML_CPU_ALL_VARIANTS=ON \
+            -DCMAKE_BUILD_TYPE=Release \
             -DGGML_VULKAN=ON
 
       - name: Build
@@ -91,13 +89,6 @@ jobs:
         id: checkout
         uses: actions/checkout@v6
 
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: vulkan-ubuntu-24.04-llvmpipe
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
       - name: Dependencies
         id: depends
         run: |
@@ -124,6 +115,13 @@ jobs:
           path: ./vulkan_sdk
           version: ${{ env.VULKAN_SDK_VERSION }}
 
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: vulkan-ubuntu-24.04-llvmpipe
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
       - name: Build
         id: cmake_build
         run: |
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index c3a018425e2..a1642fc2229 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -28,6 +28,7 @@ on:
     ]
 
 env:
+  GH_TOKEN: ${{ github.token }}
   BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
   CMAKE_ARGS: "-DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=ON -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON"
 
@@ -37,7 +38,7 @@ concurrency:
   queue: max
 
 jobs:
-  check_release:
+  check-release:
     runs-on: ubuntu-slim
 
     outputs:
@@ -59,14 +60,14 @@ jobs:
           fi
 
   macos-cpu:
-    needs: [check_release]
-    if: ${{ needs.check_release.outputs.should_release == 'true' }}
+    needs: [check-release]
+    if: ${{ needs.check-release.outputs.should_release == 'true' }}
     strategy:
       matrix:
         include:
           - build: 'arm64'
             arch: 'arm64'
-            os: macos-14
+            os: macos-26
             defines: "-DGGML_METAL_USE_BF16=ON -DGGML_METAL_EMBED_LIBRARY=ON"
           # TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23780)
           #       in order to enable it again, we have to provision dedicated runners  to run it
@@ -83,6 +84,9 @@ jobs:
 
     runs-on: ${{ matrix.os }}
 
+    permissions:
+      actions: write
+
     steps:
       - name: Clone
         id: checkout
@@ -101,7 +105,6 @@ jobs:
         uses: ggml-org/ccache-action@v1.2.21
         with:
           key: release-${{ matrix.os }}-${{ matrix.arch }}
-          append-timestamp: false # note: use this only with non-concurrent jobs!
 
       - name: Build
         id: cmake_build
@@ -116,6 +119,11 @@ jobs:
             ${{ env.CMAKE_ARGS }}
           cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
 
+      - name: ccache-clear
+        uses: ./.github/actions/ccache-clear
+        with:
+          key: release-${{ matrix.os }}-${{ matrix.arch }}
+
       - name: Determine tag name
         id: tag
         uses: ./.github/actions/get-tag-name
@@ -133,8 +141,8 @@ jobs:
           name: llama-bin-macos-${{ matrix.build }}.tar.gz
 
   ubuntu-cpu:
-    needs: [check_release]
-    if: ${{ needs.check_release.outputs.should_release == 'true' }}
+    needs: [check-release]
+    if: ${{ needs.check-release.outputs.should_release == 'true' }}
     strategy:
       matrix:
         include:
@@ -147,6 +155,9 @@ jobs:
 
     runs-on: ${{ matrix.os }}
 
+    permissions:
+      actions: write
+
     steps:
       - name: Clone
         id: checkout
@@ -161,13 +172,6 @@ jobs:
           cache: "npm"
           cache-dependency-path: "tools/ui/package-lock.json"
 
-      - name: ccache
-        if: ${{ matrix.build != 's390x' }}
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: release-${{ matrix.os }}-cpu
-          append-timestamp: false # note: use this only with non-concurrent jobs!
-
       - name: Dependencies
         id: depends
         run: |
@@ -181,6 +185,12 @@ jobs:
           echo "CC=gcc-14" >> "$GITHUB_ENV"
           echo "CXX=g++-14" >> "$GITHUB_ENV"
 
+      - name: ccache
+        if: ${{ matrix.build != 's390x' }}
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: release-${{ matrix.os }}-cpu
+
       - name: Build
         id: cmake_build
         run: |
@@ -194,6 +204,12 @@ jobs:
             ${{ env.CMAKE_ARGS }}
           cmake --build build --config Release -j $(nproc)
 
+      - name: ccache-clear
+        if: ${{ matrix.build != 's390x' }}
+        uses: ./.github/actions/ccache-clear
+        with:
+          key: release-${{ matrix.os }}-cpu
+
       - name: Determine tag name
         id: tag
         uses: ./.github/actions/get-tag-name
@@ -211,8 +227,8 @@ jobs:
           name: llama-bin-ubuntu-${{ matrix.build }}.tar.gz
 
   ubuntu-vulkan:
-    needs: [check_release]
-    if: ${{ needs.check_release.outputs.should_release == 'true' }}
+    needs: [check-release]
+    if: ${{ needs.check-release.outputs.should_release == 'true' }}
 
     strategy:
       matrix:
@@ -224,6 +240,9 @@ jobs:
 
     runs-on: ${{ matrix.os }}
 
+    permissions:
+      actions: write
+
     steps:
       - name: Clone
         id: checkout
@@ -238,12 +257,6 @@ jobs:
           cache: "npm"
           cache-dependency-path: "tools/ui/package-lock.json"
 
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: release-${{ matrix.os }}-vulkan
-          append-timestamp: false # note: use this only with non-concurrent jobs!
-
       - name: Dependencies
         id: depends
         run: |
@@ -259,6 +272,11 @@ jobs:
             echo "CXX=g++-14" >> "$GITHUB_ENV"
           fi
 
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: release-${{ matrix.os }}-vulkan
+
       - name: Build
         id: cmake_build
         run: |
@@ -272,6 +290,11 @@ jobs:
             ${{ env.CMAKE_ARGS }}
           cmake --build build --config Release -j $(nproc)
 
+      - name: ccache-clear
+        uses: ./.github/actions/ccache-clear
+        with:
+          key: release-${{ matrix.os }}-vulkan
+
       - name: Determine tag name
         id: tag
         uses: ./.github/actions/get-tag-name
@@ -289,11 +312,14 @@ jobs:
           name: llama-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz
 
   android-arm64:
-    needs: [check_release]
-    if: ${{ needs.check_release.outputs.should_release == 'true' }}
+    needs: [check-release]
+    if: ${{ needs.check-release.outputs.should_release == 'true' }}
 
     runs-on: ubuntu-latest
 
+    #permissions:
+    #  actions: write
+
     env:
       NDK_VERSION: "29.0.14206865"
 
@@ -311,18 +337,6 @@ jobs:
           cache: "npm"
           cache-dependency-path: "tools/ui/package-lock.json"
 
-      # note : disabled to spare some cache space (https://github.com/ggml-org/llama.cpp/pull/23789)
-      #        for some reason, the ccache does not improve the build time in this case
-      # example:
-      #   cache off: https://github.com/ggerganov/tmp2/actions/runs/26534713799/job/78160400831
-      #   cache on:  https://github.com/ggerganov/tmp2/actions/runs/26534713799/job/78224189394
-      #
-      #- name: ccache
-      #  uses: ggml-org/ccache-action@v1.2.21
-      #  with:
-      #    key: release-android-arm64
-      #    append-timestamp: false # note: use this only with non-concurrent jobs!
-
       - name: Set up JDK
         uses: actions/setup-java@v5
         with:
@@ -339,6 +353,17 @@ jobs:
           sdkmanager "ndk;${{ env.NDK_VERSION }}"
           echo "ANDROID_NDK=${ANDROID_SDK_ROOT}/ndk/${{ env.NDK_VERSION }}" >> $GITHUB_ENV
 
+      # note : disabled to spare some cache space (https://github.com/ggml-org/llama.cpp/pull/23789)
+      #        for some reason, the ccache does not improve the build time in this case
+      # example:
+      #   cache off: https://github.com/ggerganov/tmp2/actions/runs/26534713799/job/78160400831
+      #   cache on:  https://github.com/ggerganov/tmp2/actions/runs/26534713799/job/78224189394
+      #
+      #- name: ccache
+      #  uses: ggml-org/ccache-action@v1.2.21
+      #  with:
+      #    key: release-android-arm64
+
       - name: Build
         id: cmake_build
         run: |
@@ -357,6 +382,11 @@ jobs:
             ${{ env.CMAKE_ARGS }}
           cmake --build build --config Release -j $(nproc)
 
+      #- name: ccache-clear
+      #  uses: ./.github/actions/ccache-clear
+      #  with:
+      #    key: release-android-arm64
+
       - name: Determine tag name
         id: tag
         uses: ./.github/actions/get-tag-name
@@ -374,11 +404,14 @@ jobs:
           name: llama-bin-android-arm64.tar.gz
 
   ubuntu-24-openvino:
-    needs: [check_release]
-    if: ${{ needs.check_release.outputs.should_release == 'true' }}
+    needs: [check-release]
+    if: ${{ needs.check-release.outputs.should_release == 'true' }}
 
     runs-on: ubuntu-24.04
 
+    permissions:
+      actions: write
+
     outputs:
       openvino_version: ${{ steps.openvino_version.outputs.value }}
 
@@ -409,7 +442,6 @@ jobs:
         uses: ggml-org/ccache-action@v1.2.21
         with:
           key: release-ubuntu-24.04-openvino-release-no-preset-v1
-          append-timestamp: false # note: use this only with non-concurrent jobs!
 
       - name: Dependencies
         run: |
@@ -447,6 +479,11 @@ jobs:
             -DGGML_OPENVINO=ON
           cmake --build build/ReleaseOV --config Release -j $(nproc)
 
+      - name: ccache-clear
+        uses: ./.github/actions/ccache-clear
+        with:
+          key: release-ubuntu-24.04-openvino-release-no-preset-v1
+
       - name: Determine tag name
         id: tag
         uses: ./.github/actions/get-tag-name
@@ -464,11 +501,14 @@ jobs:
           name: llama-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz
 
   windows-cpu:
-    needs: [check_release]
-    if: ${{ needs.check_release.outputs.should_release == 'true' }}
+    needs: [check-release]
+    if: ${{ needs.check-release.outputs.should_release == 'true' }}
 
     runs-on: windows-2025
 
+    permissions:
+      actions: write
+
     strategy:
       matrix:
         include:
@@ -488,15 +528,14 @@ jobs:
           cache: "npm"
           cache-dependency-path: "tools/ui/package-lock.json"
 
+      - name: Install Ninja
+        run: |
+          choco install ninja
+
       - name: ccache
         uses: ggml-org/ccache-action@v1.2.21
         with:
           key: release-windows-2025-${{ matrix.arch }}-cpu
-          append-timestamp: false # note: use this only with non-concurrent jobs!
-
-      - name: Install Ninja
-        run: |
-          choco install ninja
 
       - name: Build
         shell: cmd
@@ -512,6 +551,11 @@ jobs:
             ${{ env.CMAKE_ARGS }}
           cmake --build build --config Release
 
+      - name: ccache-clear
+        uses: ./.github/actions/ccache-clear
+        with:
+          key: release-windows-2025-${{ matrix.arch }}-cpu
+
       - name: Pack artifacts
         id: pack_artifacts
         run: |
@@ -525,11 +569,14 @@ jobs:
           name: llama-bin-win-cpu-${{ matrix.arch }}.zip
 
   windows:
-    needs: [check_release]
-    if: ${{ needs.check_release.outputs.should_release == 'true' }}
+    needs: [check-release]
+    if: ${{ needs.check-release.outputs.should_release == 'true' }}
 
     runs-on: windows-2025
 
+    permissions:
+      actions: write
+
     env:
       OPENBLAS_VERSION: 0.3.23
       VULKAN_VERSION: 1.4.313.2
@@ -558,12 +605,6 @@ jobs:
           cache: "npm"
           cache-dependency-path: "tools/ui/package-lock.json"
 
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: release-windows-2025-${{ matrix.arch }}-${{ matrix.backend }}
-          append-timestamp: false # note: use this only with non-concurrent jobs!
-
       - name: Install Vulkan SDK
         id: get_vulkan
         if: ${{ matrix.backend == 'vulkan' }}
@@ -578,6 +619,11 @@ jobs:
         run: |
           choco install ninja
 
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: release-windows-2025-${{ matrix.arch }}-${{ matrix.backend }}
+
       - name: Install OpenCL Headers and Libs
         id: install_opencl
         if: ${{ matrix.backend == 'opencl-adreno' && matrix.arch == 'arm64' }}
@@ -604,6 +650,11 @@ jobs:
           cmake -S . -B build ${{ matrix.defines }} -DGGML_NATIVE=OFF -DGGML_CPU=OFF -DGGML_BACKEND_DL=ON -DLLAMA_BUILD_BORINGSSL=ON
           cmake --build build --config Release --target ${{ matrix.target }}
 
+      - name: ccache-clear
+        uses: ./.github/actions/ccache-clear
+        with:
+          key: release-windows-2025-${{ matrix.arch }}-${{ matrix.backend }}
+
       - name: Pack artifacts
         id: pack_artifacts
         run: |
@@ -616,11 +667,14 @@ jobs:
           name: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip
 
   windows-cuda:
-    needs: [check_release]
-    if: ${{ needs.check_release.outputs.should_release == 'true' }}
+    needs: [check-release]
+    if: ${{ needs.check-release.outputs.should_release == 'true' }}
 
     runs-on: windows-2022
 
+    permissions:
+      actions: write
+
     strategy:
       matrix:
         cuda: ['12.4', '13.3']
@@ -637,12 +691,6 @@ jobs:
           cache: "npm"
           cache-dependency-path: "tools/ui/package-lock.json"
 
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: release-windows-2022-x64-cuda-${{ matrix.cuda }}
-          append-timestamp: false # note: use this only with non-concurrent jobs!
-
       - name: Install Cuda Toolkit
         uses: ./.github/actions/windows-setup-cuda
         with:
@@ -653,6 +701,11 @@ jobs:
         run: |
           choco install ninja
 
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: release-windows-2022-x64-cuda-${{ matrix.cuda }}
+
       - name: Build
         id: cmake_build
         shell: cmd
@@ -669,6 +722,11 @@ jobs:
           set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
           cmake --build build --config Release -j %NINJA_JOBS% --target ggml-cuda
 
+      - name: ccache-clear
+        uses: ./.github/actions/ccache-clear
+        with:
+          key: release-windows-2022-x64-cuda-${{ matrix.cuda }}
+
       - name: Pack artifacts
         id: pack_artifacts
         run: |
@@ -748,7 +806,6 @@ jobs:
 #        uses: ggml-org/ccache-action@v1.2.21
 #        with:
 #          key: release-windows-2022-x64-sycl
-#          append-timestamp: false # note: use this only with non-concurrent jobs!
 #
 #      - name: Build
 #        id: cmake_build
@@ -869,7 +926,6 @@ jobs:
 #        uses: ggml-org/ccache-action@v1.2.21
 #        with:
 #          key: release-ubuntu-24.04-sycl
-#          append-timestamp: false # note: use this only with non-concurrent jobs!
 #
 #      - name: Build
 #        id: cmake_build
@@ -903,11 +959,14 @@ jobs:
 #          name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
 
   ubuntu-22-rocm:
-    needs: [check_release]
-    if: ${{ needs.check_release.outputs.should_release == 'true' }}
+    needs: [check-release]
+    if: ${{ needs.check-release.outputs.should_release == 'true' }}
 
     runs-on: ubuntu-22.04
 
+    permissions:
+      actions: write
+
     strategy:
       matrix:
         include:
@@ -938,7 +997,6 @@ jobs:
         uses: ggml-org/ccache-action@v1.2.21
         with:
           key: release-ubuntu-22.04-rocm-${{ matrix.ROCM_VERSION }}
-          append-timestamp: false # note: use this only with non-concurrent jobs!
 
       - name: Dependencies
         id: depends
@@ -996,6 +1054,11 @@ jobs:
             ${{ env.CMAKE_ARGS }}
           cmake --build build --config Release -j $(nproc)
 
+      - name: ccache-clear
+        uses: ./.github/actions/ccache-clear
+        with:
+          key: release-ubuntu-22.04-rocm-${{ matrix.ROCM_VERSION }}
+
       - name: Determine tag name
         id: tag
         uses: ./.github/actions/get-tag-name
@@ -1016,11 +1079,14 @@ jobs:
           name: llama-bin-ubuntu-rocm-${{ env.ROCM_VERSION_SHORT }}-${{ matrix.build }}.tar.gz
 
   windows-hip:
-    needs: [check_release]
-    if: ${{ needs.check_release.outputs.should_release == 'true' }}
+    needs: [check-release]
+    if: ${{ needs.check-release.outputs.should_release == 'true' }}
 
     runs-on: windows-2022
 
+    permissions:
+      actions: write
+
     env:
       HIPSDK_INSTALLER_VERSION: "26.Q1"
 
@@ -1060,7 +1126,6 @@ jobs:
         uses: ggml-org/ccache-action@v1.2.21
         with:
           key: release-windows-2022-x64-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}
-          append-timestamp: false # note: use this only with non-concurrent jobs!
 
       - name: Install ROCm
         if: steps.cache-rocm.outputs.cache-hit != 'true'
@@ -1120,6 +1185,11 @@ jobs:
           cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
           cp "${env:HIP_PATH}\bin\hipblaslt\library\*" "build\bin\hipblaslt\library\"
 
+      - name: ccache-clear
+        uses: ./.github/actions/ccache-clear
+        with:
+          key: release-windows-2022-x64-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}
+
       - name: Pack artifacts
         id: pack_artifacts
         run: |
@@ -1131,10 +1201,10 @@ jobs:
           path: llama-bin-win-hip-${{ matrix.name }}-x64.zip
           name: llama-bin-win-hip-${{ matrix.name }}-x64.zip
 
-  ios-xcode-build:
-    needs: [check_release]
-    if: ${{ needs.check_release.outputs.should_release == 'true' }}
-    runs-on: macos-15
+  ios-xcode:
+    needs: [check-release]
+    if: ${{ needs.check-release.outputs.should_release == 'true' }}
+    runs-on: macos-26
 
     steps:
       - name: Checkout code
@@ -1144,7 +1214,7 @@ jobs:
 
       - name: Setup Xcode
         run: |
-          sudo xcode-select -s /Applications/Xcode_16.4.app
+          sudo xcode-select -s /Applications/Xcode_26.4.app
 
       - name: Build
         id: cmake_build
@@ -1160,7 +1230,7 @@ jobs:
             -DLLAMA_BUILD_TESTS=OFF \
             -DLLAMA_BUILD_SERVER=OFF \
             -DCMAKE_SYSTEM_NAME=iOS \
-            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
+            -DCMAKE_OSX_DEPLOYMENT_TARGET=16.0 \
             -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
           cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
 
@@ -1281,9 +1351,9 @@ jobs:
 #          path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz
 #          name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz
 
-  ui-build:
-    needs: [check_release]
-    if: ${{ needs.check_release.outputs.should_release == 'true' }}
+  ui:
+    needs: [check-release]
+    if: ${{ needs.check-release.outputs.should_release == 'true' }}
     uses: ./.github/workflows/ui-build.yml
 
   release:
@@ -1309,9 +1379,9 @@ jobs:
       #- ubuntu-24-sycl
       - android-arm64
       - macos-cpu
-      - ios-xcode-build
+      - ios-xcode
       #- openEuler-cann
-      - ui-build
+      - ui
 
     outputs:
       tag_name: ${{ steps.tag.outputs.name }}
diff --git a/.github/workflows/ui-build-self-hosted.yml b/.github/workflows/ui-build-self-hosted.yml
new file mode 100644
index 00000000000..e5d576cda62
--- /dev/null
+++ b/.github/workflows/ui-build-self-hosted.yml
@@ -0,0 +1,48 @@
+# Reusable workflow (workflow_call) that builds the UI on a self-hosted runner and uploads the result as the 'ui-build' artifact.
+name: UI Build (self-hosted)
+
+on:
+  workflow_call:
+
+jobs:
+  build:
+    runs-on:
+      - self-hosted
+      - fast
+    env:
+      BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v6
+
+      - name: Setup Node.js
+        uses: actions/setup-node@v6
+        with:
+          node-version: '24'
+          cache: 'npm'
+          cache-dependency-path: 'tools/ui/package-lock.json'
+
+      - name: Install dependencies
+        working-directory: tools/ui
+        run: npm ci
+
+      - name: Build application
+        working-directory: tools/ui
+        run: npm run build
+
+      # Appends one "<sha256> <name>" line per entry of the build output to checksums.txt.
+      - name: Generate checksums
+        run: |
+          cd tools/ui/dist
+          for entry in *; do
+            sha256sum "$entry" | awk '{print $1, $2}' >> checksums.txt
+          done
+
+      # The artifact is only kept for one day.
+      - name: Upload built UI
+        uses: actions/upload-artifact@v6
+        with:
+          name: ui-build
+          path: tools/ui/dist/
+          retention-days: 1
diff --git a/.github/workflows/ui-build.yml b/.github/workflows/ui-build.yml
index 2653afd06c7..92b0573fb8d 100644
--- a/.github/workflows/ui-build.yml
+++ b/.github/workflows/ui-build.yml
@@ -5,7 +5,7 @@ on:
 
 jobs:
   build:
-    runs-on: [self-hosted, fast]
+    runs-on: ubuntu-slim
     env:
       BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
 
diff --git a/.github/workflows/ui-publish.yml b/.github/workflows/ui-publish.yml
index 8a0d991930c..cec0fa52a12 100644
--- a/.github/workflows/ui-publish.yml
+++ b/.github/workflows/ui-publish.yml
@@ -20,7 +20,7 @@ jobs:
   publish:
     name: Publish UI Static Output
     needs: build
-    runs-on: ubuntu-24.04-arm
+    runs-on: ubuntu-slim
 
     permissions:
       contents: read
diff --git a/.github/workflows/ui-self-hosted.yml b/.github/workflows/ui-self-hosted.yml
index 8a97a8284e5..5457d900c87 100644
--- a/.github/workflows/ui-self-hosted.yml
+++ b/.github/workflows/ui-self-hosted.yml
@@ -16,7 +16,7 @@ on:
       - master
     paths: [
       '.github/workflows/ui-self-hosted.yml',
-      '.github/workflows/ui-build.yml',
+      '.github/workflows/ui-build-self-hosted.yml',
       'tools/ui/**.*',
       'tools/server/tests/**.*'
     ]
@@ -24,7 +24,7 @@ on:
     types: [opened, synchronize, reopened]
     paths: [
       '.github/workflows/ui-self-hosted.yml',
-      '.github/workflows/ui-build.yml',
+      '.github/workflows/ui-build-self-hosted.yml',
       'tools/ui/**.*',
       'tools/server/tests/**.*'
     ]
@@ -42,7 +42,7 @@ concurrency:
 jobs:
   ui-build:
     name: Build static output
-    uses: ./.github/workflows/ui-build.yml
+    uses: ./.github/workflows/ui-build-self-hosted.yml
 
   ui-checks:
     name: Checks
diff --git a/CMakeLists.txt b/CMakeLists.txt
index edd0ea1ded0..9e7b1253c72 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -222,19 +222,6 @@ if (LLAMA_BUILD_APP)
     add_subdirectory(app)
 endif()
 
-# Automatically add all files from the 'licenses' directory
-file(GLOB EXTRA_LICENSES "${CMAKE_SOURCE_DIR}/licenses/LICENSE-*")
-
-foreach(FILE_PATH ${EXTRA_LICENSES})
-    get_filename_component(FILE_NAME "${FILE_PATH}" NAME)
-    string(REGEX REPLACE "^LICENSE-" "" NAME "${FILE_NAME}")
-    license_add_file("${NAME}" "${FILE_PATH}")
-endforeach()
-
-if (LLAMA_BUILD_COMMON)
-    license_generate(llama-common)
-endif()
-
 #
 # install
 #
diff --git a/app/CMakeLists.txt b/app/CMakeLists.txt
index 6c53ce0e4e2..3ce503955b3 100644
--- a/app/CMakeLists.txt
+++ b/app/CMakeLists.txt
@@ -15,6 +15,17 @@ target_link_libraries(${TARGET} PRIVATE
 )
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
+# Automatically add all files from the 'licenses' directory
+file(GLOB EXTRA_LICENSES "${CMAKE_SOURCE_DIR}/licenses/LICENSE-*")
+
+foreach(FILE_PATH ${EXTRA_LICENSES})
+    get_filename_component(FILE_NAME "${FILE_PATH}" NAME)
+    string(REGEX REPLACE "^LICENSE-" "" NAME "${FILE_NAME}")
+    license_add_file("${NAME}" "${FILE_PATH}")
+endforeach()
+
+license_generate(${TARGET})
+
 if(LLAMA_TOOLS_INSTALL)
     install(TARGETS ${TARGET} RUNTIME)
 endif()
diff --git a/app/llama.cpp b/app/llama.cpp
index b0b86fd47d9..30b09f9ef7e 100644
--- a/app/llama.cpp
+++ b/app/llama.cpp
@@ -5,6 +5,9 @@
 #include <string>
 #include <vector>
 
+// embedded data generated by cmake
+extern const char * LICENSES[];
+
 // visible
 int llama_server(int argc, char ** argv);
 int llama_cli(int argc, char ** argv);
@@ -17,8 +20,23 @@ int llama_fit_params(int argc, char ** argv);
 int llama_quantize(int argc, char ** argv);
 int llama_perplexity(int argc, char ** argv);
 
+// hands the update over to the install script, which downloads and swaps the binary; returns 0 on success, 1 on failure
+static int llama_update(int argc, char ** argv) {
+    (void) argc;
+    (void) argv;
+#if defined(_WIN32)
+    const int status = system("powershell -NoProfile -ExecutionPolicy Bypass -Command \"irm https://llama.app/install.ps1 | iex\"");
+#else
+    const int status = system("curl -fsSL https://llama.app/install.sh | sh");
+#endif
+    return status == 0 ? 0 : 1; // a raw POSIX wait status (typically 256 for exit code 1) can wrap to 0 when used as a process exit code
+}
+
+static const char * progname;
+
 static int help(int argc, char ** argv);
 static int version(int argc, char ** argv);
+static int licenses(int argc, char ** argv);
 
 struct command {
     const char * name;
@@ -31,14 +49,16 @@ struct command {
 static const command cmds[] = {
     {"serve",         "HTTP API server",                                    {"server"},   false, llama_server       },
     {"cli",           "Command-line interactive interface",                 {"client"},   false, llama_cli          },
+    {"update",        "Update llama to the latest release",                 {},           false, llama_update       },
     {"completion",    "Text completion",                                    {"complete"}, true,  llama_completion   },
     {"bench",         "Benchmark prompt processing and text generation",    {},           true,  llama_bench        },
     {"batched-bench", "Benchmark batched decoding performance",             {},           true,  llama_batched_bench},
     {"fit-params",    "Compute parameters to fit a model in device memory", {},           true,  llama_fit_params   },
     {"quantize",      "Quantize a model",                                   {},           true,  llama_quantize     },
     {"perplexity",    "Compute model perplexity and KL divergence",         {},           true,  llama_perplexity   },
-    {"version",       "Show version",                                       {},           true,  version            },
-    {"help",          "Show available commands",                            {},           true,  help               },
+    {"version",       "Show version",                                       {},           false, version            },
+    {"licenses",      "Show third-party licenses",                          {"credits"},  false, licenses           },
+    {"help",          "Show available commands",                            {},           false, help               },
 };
 
 static int version(int argc, char ** argv) {
@@ -46,17 +66,34 @@ static int version(int argc, char ** argv) {
     return 0;
 }
 
+// prints every embedded third-party license text, each followed by a newline
+static int licenses(int argc, char ** argv) {
+    (void) argc;
+    (void) argv;
+
+    // the loop stops at the first null entry of the cmake-generated LICENSES array
+    for (int i = 0; LICENSES[i]; ++i) {
+        printf("%s\n", LICENSES[i]);
+    }
+    return 0;
+}
+
 static int help(int argc, char ** argv) {
     const bool show_all = argc >= 2 && std::string(argv[1]) == "all";
 
-    printf("Usage: llama <command> [options]\n\nAvailable commands:\n");
+    printf("Usage: %s <command> [options]\n\nAvailable commands:\n", progname);
 
     for (const auto & cmd : cmds) {
         if (show_all || !cmd.hidden) {
             printf("  %-15s %s\n", cmd.name, cmd.desc);
         }
     }
-    printf("\nRun 'llama <command> --help' for command-specific usage.\n");
+    printf("\n");
+
+    if (!show_all) {
+        printf("Run '%s help all' to show additional commands.\n", progname);
+    }
+    printf("Run '%s <command> --help' for command-specific usage.\n", progname);
 
     return 0;
 }
@@ -74,13 +106,13 @@ static bool matches(const std::string & arg, const command & cmd) {
 }
 
 int main(int argc, char ** argv) {
+    progname = argv[0];
+
     const std::string arg = argc >= 2 ? argv[1] : "help";
 
     for (const auto & cmd : cmds) {
         if (matches(arg, cmd)) {
-
-            // router spawns children through this same binary, it needs the
-            // subcommand to relaunch as 'llama serve' and not bare options
+            // keep cmd.name so the router's child processes re-invoke correctly
 #ifdef _WIN32
             _putenv_s("LLAMA_APP_CMD", cmd.name);
 #else
diff --git a/build-xcframework.sh b/build-xcframework.sh
index d287d72fbd8..5d289922a84 100755
--- a/build-xcframework.sh
+++ b/build-xcframework.sh
@@ -8,6 +8,7 @@ TVOS_MIN_OS_VERSION=16.4
 
 BUILD_SHARED_LIBS=OFF
 LLAMA_BUILD_APP=OFF
+LLAMA_BUILD_COMMON=OFF
 LLAMA_BUILD_EXAMPLES=OFF
 LLAMA_BUILD_TOOLS=OFF
 LLAMA_BUILD_TESTS=OFF
@@ -33,6 +34,7 @@ COMMON_CMAKE_ARGS=(
     -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
     -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS}
     -DLLAMA_BUILD_APP=${LLAMA_BUILD_APP}
+    -DLLAMA_BUILD_COMMON=${LLAMA_BUILD_COMMON}
     -DLLAMA_BUILD_EXAMPLES=${LLAMA_BUILD_EXAMPLES}
     -DLLAMA_BUILD_TOOLS=${LLAMA_BUILD_TOOLS}
     -DLLAMA_BUILD_TESTS=${LLAMA_BUILD_TESTS}
@@ -416,7 +418,7 @@ cmake -B build-ios-sim -G Xcode \
     -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
     -DLLAMA_OPENSSL=OFF \
     -S .
-cmake --build build-ios-sim --config Release -- -quiet
+cmake --build build-ios-sim --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet
 
 echo "Building for iOS devices..."
 cmake -B build-ios-device -G Xcode \
@@ -430,7 +432,7 @@ cmake -B build-ios-device -G Xcode \
     -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
     -DLLAMA_OPENSSL=OFF \
     -S .
-cmake --build build-ios-device --config Release -- -quiet
+cmake --build build-ios-device --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet
 
 echo "Building for macOS..."
 cmake -B build-macos -G Xcode \
@@ -441,7 +443,7 @@ cmake -B build-macos -G Xcode \
     -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
     -DLLAMA_OPENSSL=OFF \
     -S .
-cmake --build build-macos --config Release -- -quiet
+cmake --build build-macos --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet
 
 echo "Building for visionOS..."
 cmake -B build-visionos -G Xcode \
@@ -456,7 +458,7 @@ cmake -B build-visionos -G Xcode \
     -DLLAMA_OPENSSL=OFF \
     -DLLAMA_BUILD_SERVER=OFF \
     -S .
-cmake --build build-visionos --config Release -- -quiet
+cmake --build build-visionos --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet
 
 echo "Building for visionOS simulator..."
 cmake -B build-visionos-sim -G Xcode \
@@ -471,7 +473,7 @@ cmake -B build-visionos-sim -G Xcode \
     -DLLAMA_OPENSSL=OFF \
     -DLLAMA_BUILD_SERVER=OFF \
     -S .
-cmake --build build-visionos-sim --config Release -- -quiet
+cmake --build build-visionos-sim --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet
 
 # Add tvOS builds (might need the same u_int definitions as watchOS and visionOS)
 echo "Building for tvOS simulator..."
@@ -487,7 +489,7 @@ cmake -B build-tvos-sim -G Xcode \
     -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
     -DLLAMA_OPENSSL=OFF \
     -S .
-cmake --build build-tvos-sim --config Release -- -quiet
+cmake --build build-tvos-sim --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet
 
 echo "Building for tvOS devices..."
 cmake -B build-tvos-device -G Xcode \
@@ -502,7 +504,7 @@ cmake -B build-tvos-device -G Xcode \
     -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
     -DLLAMA_OPENSSL=OFF \
     -S .
-cmake --build build-tvos-device --config Release -- -quiet
+cmake --build build-tvos-device --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet
 
 # Setup frameworks and copy binaries and headers
 echo "Setting up framework structures..."
diff --git a/common/arg.cpp b/common/arg.cpp
index bdc2e9eb4fc..e0f6c606608 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -50,8 +50,6 @@
 
 #define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
 
-extern const char * LICENSES[];
-
 using json = nlohmann::ordered_json;
 using namespace common_arg_utils;
 
@@ -342,9 +340,7 @@ struct handle_model_result {
 };
 
 static handle_model_result common_params_handle_model(struct common_params_model & model,
-                                                      const std::string          & bearer_token,
-                                                      bool                         offline,
-                                                      bool                         search_mtp = false) {
+                                                      const common_download_opts & opts) {
     handle_model_result result;
 
     if (!model.docker_repo.empty()) {
@@ -356,10 +352,9 @@ static handle_model_result common_params_handle_model(struct common_params_model
             model.hf_file = model.path;
             model.path = "";
         }
-        common_download_opts opts;
-        opts.bearer_token = bearer_token;
-        opts.offline = offline;
-        auto download_result = common_download_model(model, opts, true, search_mtp);
+        common_download_opts hf_opts = opts;
+        hf_opts.download_mmproj = true; // also look for mmproj when downloading hf model
+        auto download_result = common_download_model(model, hf_opts);
 
         if (download_result.model_path.empty()) {
             throw std::runtime_error("failed to download model from Hugging Face");
@@ -384,9 +379,6 @@ static handle_model_result common_params_handle_model(struct common_params_model
             model.path = fs_get_cache_file(string_split<std::string>(f, '/').back());
         }
 
-        common_download_opts opts;
-        opts.bearer_token = bearer_token;
-        opts.offline = offline;
         auto download_result = common_download_model(model, opts);
         if (download_result.model_path.empty()) {
             throw std::runtime_error("failed to download model from " + model.url);
@@ -443,35 +435,49 @@ static bool parse_bool_value(const std::string & value) {
 // CLI argument parsing functions
 //
 
-void common_params_handle_models(common_params & params, llama_example curr_ex) {
+bool common_params_handle_models(common_params & params, llama_example curr_ex) {
     const bool spec_type_draft_mtp = std::find(params.speculative.types.begin(),
                                          params.speculative.types.end(),
                                          COMMON_SPECULATIVE_TYPE_DRAFT_MTP) != params.speculative.types.end();
 
-    auto res = common_params_handle_model(params.model, params.hf_token, params.offline, spec_type_draft_mtp);
-    if (params.no_mmproj) {
-        params.mmproj = {};
-    } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
-        // optionally, handle mmproj model when -hf is specified
-        params.mmproj = res.mmproj;
-    }
-    // only download mmproj if the current example is using it
-    for (const auto & ex : mmproj_examples) {
-        if (curr_ex == ex) {
-            common_params_handle_model(params.mmproj,    params.hf_token, params.offline);
-            break;
+    common_download_opts opts;
+    opts.bearer_token  = params.hf_token;
+    opts.offline       = params.offline;
+    opts.skip_download = params.skip_download;
+    opts.download_mtp  = spec_type_draft_mtp;
+
+    try {
+        auto res = common_params_handle_model(params.model, opts);
+        if (params.no_mmproj) {
+            params.mmproj = {};
+        } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
+            // optionally, handle mmproj model when -hf is specified
+            params.mmproj = res.mmproj;
+        }
+        // only download mmproj if the current example is using it
+        for (const auto & ex : mmproj_examples) {
+            if (curr_ex == ex) {
+                common_params_handle_model(params.mmproj, opts);
+                break;
+            }
         }
+
+        // when --spec-type mtp is set and no draft model was provided explicitly,
+        // fall back to the MTP head discovered alongside the -hf model
+        if (spec_type_draft_mtp && res.found_mtp &&
+            params.speculative.draft.mparams.path.empty() &&
+            params.speculative.draft.mparams.hf_repo.empty() &&
+            params.speculative.draft.mparams.url.empty()) {
+            params.speculative.draft.mparams.path = res.mtp.path;
+        }
+        common_params_handle_model(params.speculative.draft.mparams, opts);
+        common_params_handle_model(params.vocoder.model,             opts);
+        return true;
+    } catch (const common_skip_download_exception &) {
+        return false;
+    } catch (const std::exception &) {
+        throw;
     }
-    // when --spec-type mtp is set and no draft model was provided explicitly,
-    // fall back to the MTP head discovered alongside the -hf model
-    if (spec_type_draft_mtp && res.found_mtp &&
-        params.speculative.draft.mparams.path.empty() &&
-        params.speculative.draft.mparams.hf_repo.empty() &&
-        params.speculative.draft.mparams.url.empty()) {
-        params.speculative.draft.mparams.path = res.mtp.path;
-    }
-    common_params_handle_model(params.speculative.draft.mparams, params.hf_token, params.offline);
-    common_params_handle_model(params.vocoder.model,             params.hf_token, params.offline);
 }
 
 static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
@@ -1091,16 +1097,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             exit(0);
         }
     ));
-    add_opt(common_arg(
-        {"--license"},
-        "show source code license and dependencies",
-        [](common_params &) {
-            for (int i = 0; LICENSES[i]; ++i) {
-                printf("%s\n", LICENSES[i]);
-            }
-            exit(0);
-        }
-    ));
     add_opt(common_arg(
         {"-cl", "--cache-list"},
         "show list of models in cache",
@@ -2998,7 +2994,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
             key_file.close();
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_KEY_FILE"));
     add_opt(common_arg(
         {"--ssl-key-file"}, "FNAME",
         "path to file a PEM-encoded SSL private key",
diff --git a/common/arg.h b/common/arg.h
index 2a85f09f3eb..0010f2a9ac9 100644
--- a/common/arg.h
+++ b/common/arg.h
@@ -129,8 +129,11 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
 // see: https://github.com/ggml-org/llama.cpp/issues/18163
 void common_params_add_preset_options(std::vector<common_arg> & args);
 
-// Populate model paths (main model, mmproj, etc) from -hf if necessary
-void common_params_handle_models(common_params & params, llama_example curr_ex);
+// populate model paths (main model, mmproj, etc) from -hf if necessary
+// return true if the model is ready to use
+// throw an exception if there is an error that prevents the model from being used (e.g. network error, model not found, etc)
+// if params.skip_download is true, no downloads will be attempted. return false if the model is invalid or missing (e.g. ETag check failed)
+bool common_params_handle_models(common_params & params, llama_example curr_ex);
 
 // initialize argument parser context - used by test-arg-parser and preset
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
diff --git a/common/common.h b/common/common.h
index 8a0e5eed5ee..99898800d1d 100644
--- a/common/common.h
+++ b/common/common.h
@@ -479,7 +479,7 @@ struct common_params {
 
     std::set<std::string> model_alias;     // model aliases                                                 // NOLINT
     std::set<std::string> model_tags;      // model tags (informational, not used for routing)              // NOLINT
-    std::string hf_token             = ""; // HF token                                                      // NOLINT
+    std::string hf_token             = ""; // HF token (aka bearer token)                                   // NOLINT
     std::string prompt               = "";                                                                  // NOLINT
     std::string system_prompt        = "";                                                                  // NOLINT
     std::string prompt_file          = ""; // store the external prompt file name                           // NOLINT
@@ -507,6 +507,7 @@ struct common_params {
     int32_t control_vector_layer_start = -1; // layer range for control vector
     int32_t control_vector_layer_end   = -1; // layer range for control vector
     bool    offline                    = false;
+    bool    skip_download              = false; // skip model file downloading
 
     int32_t ppl_stride      = 0;     // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
     int32_t ppl_output_type = 0;     // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
@@ -587,7 +588,7 @@ struct common_params {
     // server params
     int32_t port                = 8080;          // server listens on this network port
     bool    reuse_port          = false;         // allow multiple sockets to bind to the same port
-    int32_t timeout_read        = 600;           // http read timeout in seconds
+    int32_t timeout_read        = 3600;          // http read timeout in seconds
     int32_t timeout_write       = timeout_read;  // http write timeout in seconds
     int32_t n_threads_http      = -1;    // number of threads to process HTTP requests (TODO: support threadpool)
     int32_t n_cache_reuse       = 0;     // min chunk size to reuse from the cache via KV shifting
diff --git a/common/download.cpp b/common/download.cpp
index 103bc408faf..40f6eb780f4 100644
--- a/common/download.cpp
+++ b/common/download.cpp
@@ -292,6 +292,10 @@ static int common_download_file_single_online(const std::string & url,
 
     const bool file_exists = std::filesystem::exists(path);
 
+    if (!file_exists && opts.skip_download) {
+        return -2; // file is missing and download is disabled
+    }
+
     if (file_exists && skip_etag) {
         LOG_DBG("%s: using cached file: %s\n", __func__, path.c_str());
         return 304; // 304 Not Modified - fake cached response
@@ -357,6 +361,10 @@ static int common_download_file_single_online(const std::string & url,
             LOG_DBG("%s: using cached file (same etag): %s\n", __func__, path.c_str());
             return 304; // 304 Not Modified - fake cached response
         }
+        // past this point, the file exists but is different from the server version, so we need to redownload it
+        if (opts.skip_download) {
+            return -2; // special code to indicate that the download was skipped due to etag mismatch
+        }
         if (remove(path.c_str()) != 0) {
             LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
             return -1;
@@ -775,13 +783,13 @@ static std::vector<download_task> get_url_tasks(const common_params_model & mode
 }
 
 common_download_model_result common_download_model(const common_params_model  & model,
-                                                   const common_download_opts & opts,
-                                                   bool download_mmproj,
-                                                   bool download_mtp) {
+                                                   const common_download_opts & opts) {
     common_download_model_result result;
     std::vector<download_task> tasks;
     hf_plan hf;
 
+    bool download_mmproj = opts.download_mmproj;
+    bool download_mtp = opts.download_mtp;
     bool is_hf = !model.hf_repo.empty();
 
     if (is_hf) {
@@ -806,18 +814,22 @@ common_download_model_result common_download_model(const common_params_model  &
         return result;
     }
 
-    std::vector<std::future<bool>> futures;
+    std::vector<std::future<int>> futures;
     for (const auto & task : tasks) {
         futures.push_back(std::async(std::launch::async,
             [&task, &opts, is_hf]() {
-                int status = common_download_file_single(task.url, task.path, opts, is_hf);
-                return is_http_status_ok(status);
+                return common_download_file_single(task.url, task.path, opts, is_hf);
             }
         ));
     }
 
     for (auto & f : futures) {
-        if (!f.get()) {
+        int status = f.get();
+        if (status == -2 && opts.skip_download) {
+            throw common_skip_download_exception();
+        }
+        bool is_ok = is_http_status_ok(status);
+        if (!is_ok) {
             return {};
         }
     }
diff --git a/common/download.h b/common/download.h
index 4a169ef7796..ebeedd6058c 100644
--- a/common/download.h
+++ b/common/download.h
@@ -52,6 +52,9 @@ struct common_download_opts {
     std::string bearer_token;
     common_header_list headers;
     bool offline = false;
+    bool skip_download = false; // if true, only validation is performed, common_skip_download_exception may be thrown if the file is missing or invalid
+    bool download_mmproj = false;
+    bool download_mtp = false;
     common_download_callback * callback = nullptr;
 };
 
@@ -62,6 +65,11 @@ struct common_download_model_result {
     std::string mtp_path;
 };
 
+// thrown when skip_download is set and the file is missing or invalid (e.g. ETag check failed)
+struct common_skip_download_exception : public std::runtime_error {
+    common_skip_download_exception() : std::runtime_error("skip download") {}
+};
+
 // Download model from HuggingFace repo or URL
 //
 // input (via model struct):
@@ -89,9 +97,7 @@ struct common_download_model_result {
 // returns result with model_path, mmproj_path and mtp_path (empty when not found / on failure)
 common_download_model_result common_download_model(
     const common_params_model & model,
-    const common_download_opts & opts = {},
-    bool download_mmproj = false,
-    bool download_mtp    = false
+    const common_download_opts & opts = {}
 );
 
 // returns list of cached models
@@ -99,6 +105,7 @@ std::vector<common_cached_model_info> common_list_cached_models();
 
 // download single file from url to local path
 // returns status code or -1 on error
+// returns -2 if skip_download=true and the download was skipped (file missing, or outdated due to ETag mismatch)
 // skip_etag: if true, don't read/write .etag files (for HF cache where filename is the hash)
 int common_download_file_single(const std::string & url,
                                 const std::string & path,
diff --git a/common/ngram-mod.cpp b/common/ngram-mod.cpp
index 76f7257f611..1b5a09a5eb6 100644
--- a/common/ngram-mod.cpp
+++ b/common/ngram-mod.cpp
@@ -1,5 +1,7 @@
 #include "ngram-mod.h"
 
+#include <algorithm>
+
 //
 // common_ngram_mod
 //
diff --git a/conversion/__init__.py b/conversion/__init__.py
index 2a87bd75b44..cfaa24ba1a1 100644
--- a/conversion/__init__.py
+++ b/conversion/__init__.py
@@ -47,6 +47,7 @@
     "DeepseekForCausalLM": "deepseek",
     "DeepseekV2ForCausalLM": "deepseek",
     "DeepseekV3ForCausalLM": "deepseek",
+    "DeepseekV32ForCausalLM": "deepseek",
     "DistilBertForMaskedLM": "bert",
     "DistilBertForSequenceClassification": "bert",
     "DistilBertModel": "bert",
@@ -236,6 +237,7 @@
 MMPROJ_MODEL_MAP: dict[str, str] = {
     "AudioFlamingo3ForConditionalGeneration": "ultravox",
     "CogVLMForCausalLM": "cogvlm",
+    "DeepseekOCR2ForCausalLM": "deepseek",
     "DeepseekOCRForCausalLM": "deepseek",
     "DotsOCRForCausalLM": "dotsocr",
     "Gemma3ForConditionalGeneration": "gemma",
diff --git a/conversion/base.py b/conversion/base.py
index 9cddd1340f7..866625a8045 100644
--- a/conversion/base.py
+++ b/conversion/base.py
@@ -915,6 +915,8 @@ def load():
                             gguf.MODEL_TENSOR.SSM_CONV1D_Q,
                             gguf.MODEL_TENSOR.SSM_CONV1D_K,
                             gguf.MODEL_TENSOR.SSM_CONV1D_V,
+                            # DSA indexer weights should be F32
+                            gguf.MODEL_TENSOR.INDEXER_PROJ,
                         )
                     )
                     or new_name[-7:] not in (".weight", ".lora_a", ".lora_b")
@@ -1138,7 +1140,7 @@ def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Ca
         # Skip multimodal tensors
         if name.startswith(("mlp", "vit.", "vpm.", "siglip2.", "conformer.", "merger.", "resampler.", "sound_encoder.", "sound_projection.", "speech_embeddings.")) \
                 or "visual." in name or "vision." in name or "audio." in name or "talker." in name \
-                or "vision_" in name or "audio_" in name or "sam_model" in name \
+                or "vision_" in name or "audio_" in name \
                 or "token2wav." in name or "code2wav." in name \
                 or "projector." in name or "pre_mm_projector_norm" in name \
                 or "image_newline" in name or "view_seperator" in name \
@@ -1445,6 +1447,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "0fe1cf6eda062318a1af7270f3331a85c539a01778ff948e24388e949c5282f4":
             # ref: https://huggingface.co/evilfreelancer/ruGPT3XL
             res = "gpt-2"
+        if chkhsh == "9e454714343b69b99b71795c1d27a68c2a1d15dab111f4d353109f966af29da7":
+            # ref: https://huggingface.co/LiquidAI/LFM2.5-8B-A1B
+            res = "lfm2"
         if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
             # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
             res = "llama-bpe"
@@ -1596,7 +1601,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:
             # ref: https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct
             res = "midm-2.0"
         if chkhsh == "169bf0296a13c4d9b7672313f749eb36501d931022de052aad6e36f2bf34dd51":
-            # ref: https://huggingface.co/LiquidAI/LFM2-Tokenizer
+            # ref: https://huggingface.co/LiquidAI/LFM2.5-350M
             res = "lfm2"
         if chkhsh == "2085e1638f6c377a0aa4ead21b27bb4cb941bf800df86ed391011769c1758dfb":
             # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B
diff --git a/conversion/deepseek.py b/conversion/deepseek.py
index e149fcbf752..72520cc9f6a 100644
--- a/conversion/deepseek.py
+++ b/conversion/deepseek.py
@@ -16,10 +16,14 @@
 
 @ModelBase.register("DeepseekOCRForCausalLM")
 class DeepseekOCRVisionModel(MmprojModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.clip_projector_type = gguf.VisionProjectorType.DEEPSEEKOCR
+
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
-        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.DEEPSEEKOCR)
+        self.gguf_writer.add_clip_projector_type(self.clip_projector_type)
         # default values below are taken from HF tranformers code
         self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6))
         self.gguf_writer.add_vision_use_gelu(True)
@@ -49,22 +53,27 @@ def get_vision_config(self) -> dict[str, Any]:
             raise ValueError("DeepseekOCR model requires 'vision_config' in the model configuration, but it was not found")
 
         vision_config['sam'] = vision_config['width']['sam_vit_b']
-        vision_config.update(vision_config['width']['clip-l-14-224'])
-        vision_config['hidden_size'] = vision_config['width']
-        vision_config['num_heads'] = vision_config['heads']
-        vision_config['intermediate_size'] = vision_config['heads'] * 4
+        if vision_config['width'].get('clip-l-14-224') is not None:
+            vision_config.update(vision_config['width']['clip-l-14-224'])
+        if isinstance(vision_config['width'], int):
+            vision_config['hidden_size'] = vision_config['width']
+        if vision_config.get('heads') is not None:
+            vision_config['num_heads'] = vision_config['heads']
+            vision_config['intermediate_size'] = vision_config['heads'] * 4
 
         return vision_config
 
     def tensor_force_quant(self, name, new_name, bid, n_dims):
-        if ".embeddings." in name or 'pos_embed' in name:
-            return gguf.GGMLQuantizationType.F32
-        if ".rel_pos_h" in name or '.rel_pos_w' in name:
-            return gguf.GGMLQuantizationType.F32
-        if ".neck." in name or ".net_" in name:
-            return gguf.GGMLQuantizationType.F32
+        for nq_name in ('.embeddings.', 'pos_embed', '.rel_pos_h', '.rel_pos_w', '.neck.', '.net_'):
+            if nq_name in name:
+                return gguf.GGMLQuantizationType.F32
         return super().tensor_force_quant(name, new_name, bid, n_dims)
 
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name.endswith("view_seperator"):
+            data_torch = data_torch.unsqueeze(0)
+        yield from super().modify_tensors(data_torch, name, bid)
+
     @classmethod
     def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
         name, gen = item
@@ -81,6 +90,33 @@ def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Ca
         return super().filter_tensors((name, gen))
 
 
+@ModelBase.register("DeepseekOCR2ForCausalLM")
+class DeepseekOCR2VisionModel(DeepseekOCRVisionModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.clip_projector_type = gguf.VisionProjectorType.DEEPSEEKOCR2
+
+    def set_gguf_parameters(self):
+        # the vision tower's qwen2 encoder is built from fixed defaults,
+        # see build_qwen2_decoder_as_encoder() in deepencoderv2.py
+        if self.hparams.get("patch_size") is None:
+            self.hparams["patch_size"] = 16
+        if self.hparams.get("intermediate_size") is None:
+            self.hparams["intermediate_size"] = 4864
+        if self.hparams.get("num_attention_heads") is None:
+            self.hparams["num_attention_heads"] = 14
+        super().set_gguf_parameters()
+        # qwen2 encoder is GQA: 14 Q heads, 2 KV heads
+        self.gguf_writer.add_vision_head_count_kv(2)
+
+    def get_vision_config(self) -> dict[str, Any]:
+        vision_config = super().get_vision_config()
+        vision_config['hidden_size'] = vision_config['width']['qwen2-0-5b']['dim']
+        if vision_config.get('layers') is None:
+            vision_config['layers'] = 24
+        return vision_config
+
+
 @ModelBase.register("DeepseekForCausalLM")
 class DeepseekModel(TextModel):
     model_arch = gguf.MODEL_ARCH.DEEPSEEK
@@ -188,13 +224,21 @@ def __init__(self, *args, **kwargs):
         self.origin_hf_arch = hparams.get('architectures', [None])[0]
 
         # special handling for Deepseek OCR
-        if self.origin_hf_arch == "DeepseekOCRForCausalLM":
+        if self.origin_hf_arch in ("DeepseekOCRForCausalLM", "DeepseekOCR2ForCausalLM"):
             self.model_arch = gguf.MODEL_ARCH.DEEPSEEK2OCR
             self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
             self.gguf_writer.add_architecture()
             # default jinja template
             self.gguf_writer.add_chat_template("{% for m in messages %}{{m['content']}}{% endfor %}")
 
+    @classmethod
+    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
+        name, _ = item
+        # DeepSeek-OCR vision encoder (SAM + DeepSeek-OCR-2 qwen2 tower)
+        if "sam_model" in name or "qwen2_model" in name:
+            return None
+        return super().filter_tensors(item)
+
     def set_vocab(self):
         try:
             self._set_vocab_gpt2()
@@ -386,3 +430,32 @@ def prepare_tensors(self):
             experts = [k for d in self._experts for k in d.keys()]
             if len(experts) > 0:
                 raise ValueError(f"Unprocessed experts: {experts}")
+
+
+@ModelBase.register("DeepseekV32ForCausalLM")
+class DeepseekV32Model(DeepseekV2Model):
+    model_arch = gguf.MODEL_ARCH.DEEPSEEK32
+    skip_mtp = False
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.block_count = self.hparams["num_hidden_layers"] + self.hparams.get("num_nextn_predict_layers", 0)
+        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
+
+    def set_vocab(self):
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
+        assert getattr(tokenizer, "add_bos_token", False), "Change value of add_bos_token to true in tokenizer_config.json file."
+        self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        # NextN/MTP prediction layers
+        if (num_nextn_predict_layers := self.hparams.get("num_nextn_predict_layers")) is not None:
+            self.gguf_writer.add_nextn_predict_layers(num_nextn_predict_layers)
+
+        # DSA indexer parameters
+        self.gguf_writer.add_indexer_head_count(self.hparams["index_n_heads"])
+        self.gguf_writer.add_indexer_key_length(self.hparams["index_head_dim"])
+        self.gguf_writer.add_indexer_top_k(self.hparams["index_topk"])
diff --git a/conversion/gemma.py b/conversion/gemma.py
index 1b427a30cd5..76beedcf0d3 100644
--- a/conversion/gemma.py
+++ b/conversion/gemma.py
@@ -786,14 +786,15 @@ def set_gguf_parameters(self):
         super().set_gguf_parameters()
 
         # vision params
+        assert self.hparams_vision is not None
         self.gguf_writer.add_clip_vision_projector_type(gguf.VisionProjectorType.GEMMA4V)
-        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-6))
 
         # audio params
-        if self.hparams_audio:
-            self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA4A)
-            self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"])
-            self.gguf_writer.add_audio_attention_layernorm_eps(1e-5)
+        assert self.hparams_audio is not None
+        self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA4A)
+        self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"])
+        self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams_audio.get("layer_norm_eps", 1e-6))
 
     def is_audio_tensor(self, name: str) -> bool:
         return "audio_tower" in name or "embed_audio" in name
diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py
index 66aa1cb2fc0..827af277b92 100755
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -139,7 +139,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "seed-coder",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", },
     {"name": "a.x-4.0",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/skt/A.X-4.0", },
     {"name": "midm-2.0",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct", },
-    {"name": "lfm2",             "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"},
+    {"name": "lfm2",             "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2.5-350M", },
     {"name": "exaone4",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", },
     {"name": "mellum",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum-4b-base", },
     {"name": "modern-bert",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/answerdotai/ModernBERT-base", },
@@ -183,6 +183,8 @@ class TOKENIZER_TYPE(IntEnum):
     # jina-v2-de variants
     {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/aari1995/German_Semantic_V3", "chkhsh": "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df"},
     {"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/evilfreelancer/ruGPT3XL", "chkhsh": "0fe1cf6eda062318a1af7270f3331a85c539a01778ff948e24388e949c5282f4"},
+    # lfm2 variants
+    {"name": "lfm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2.5-8B-A1B", "chkhsh": "9e454714343b69b99b71795c1d27a68c2a1d15dab111f4d353109f966af29da7"},
 ]
 
 
diff --git a/docs/speculative.md b/docs/speculative.md
index 041ff58038d..43d18185891 100644
--- a/docs/speculative.md
+++ b/docs/speculative.md
@@ -323,3 +323,8 @@ statistics ngram_map_k: #calls(b,g,a) = 6 1690 26, #gen drafts = 26, #acc drafts
 - `#gen tokens`: number of tokens generated by this implementation (including rejected tokens)
 - `#acc tokens`: number of tokens accepted by the main model
 - `dur(b,g,a): durations of begin (new prompt), generation and accumulation (process acceptance).
+
+## Benchmarking
+
+To measure the end-to-end effect of speculative decoding (throughput, latency, and draft acceptance) across diverse prompts, see the SPEED-Bench client in [tools/server/bench/speed-bench](../tools/server/bench/speed-bench/README.md).
+It runs against a running `llama-server` and can compare a baseline run against a speculative-decoding run.
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index f542f18b6d4..dc8899b46ef 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -5,7 +5,7 @@ project("ggml" C CXX ASM)
 ### GGML Version
 set(GGML_VERSION_MAJOR 0)
 set(GGML_VERSION_MINOR 13)
-set(GGML_VERSION_PATCH 0)
+set(GGML_VERSION_PATCH 1)
 set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
 
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
diff --git a/ggml/src/ggml-backend-meta.cpp b/ggml/src/ggml-backend-meta.cpp
index d0d64523b4a..48b2027fac3 100644
--- a/ggml/src/ggml-backend-meta.cpp
+++ b/ggml/src/ggml-backend-meta.cpp
@@ -2076,6 +2076,7 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
             node_zero->src[0] = node;
             ggml_set_op_params_f32(node_zero, 0, 0.0f);
             node_zero->data = node->data;
+            node_zero->buffer = node->buffer;
             node_zero->flags |= GGML_TENSOR_FLAG_COMPUTE;
 
             step_cgraphs[j] = get_cgraph_aux();
diff --git a/ggml/src/ggml-cpu/arch/loongarch/quants.c b/ggml/src/ggml-cpu/arch/loongarch/quants.c
index 74e0c086c6d..9c43da6cf89 100644
--- a/ggml/src/ggml-cpu/arch/loongarch/quants.c
+++ b/ggml/src/ggml-cpu/arch/loongarch/quants.c
@@ -977,6 +977,35 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
     sumf = hsum_float_8(acc);
 
     *s = sumf;
+
+#elif defined(__loongarch_sx)
+
+    __m128 acc = (__m128)__lsx_vldi(0);
+
+    for (; ib < nb; ++ib) {
+        const float d = GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d);
+        const __m128i qx_0 = __lsx_vld((const __m128i *)x[ib].qs, 0);
+        const __m128i qx_1 = __lsx_vld((const __m128i *)x[ib].qs + 1, 0);
+        const __m128i qy_0 = __lsx_vld((const __m128i *)y[ib].qs, 0);
+        const __m128i qy_1 = __lsx_vld((const __m128i *)y[ib].qs + 1, 0);
+
+        const __m128i p16_0 = lsx_maddubs_h(qx_0, qy_0);
+        const __m128i p16_1 = lsx_maddubs_h(qx_1, qy_1);
+
+        // Sum int16 pairs → int32
+        const __m128i s_0 = __lsx_vaddwev_w_h(p16_0, p16_1);
+        const __m128i s_1 = __lsx_vaddwod_w_h(p16_0, p16_1);
+
+        const __m128 q = __lsx_vffint_s_w(__lsx_vadd_w(s_0, s_1));
+        acc = __lsx_vfmadd_s(__lsx_vreplfr2vr_s(d), q, acc);
+    }
+
+    __m128 res = lsx_hadd_s(acc, acc);
+    res = lsx_hadd_s(res, res);
+    sumf = ((v4f32)res)[0];
+
+    *s = sumf;
+
 #else
     UNUSED(nb);
     UNUSED(ib);
@@ -1443,6 +1472,99 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 
     *s = hsum_float_8(acc);
 
+#elif defined(__loongarch_sx)
+
+    const __m128i m32s = __lsx_vreplgr2vr_b(32);
+
+    __m128 acc_0 = (__m128)__lsx_vldi(0);
+    __m128 acc_1 = (__m128)__lsx_vldi(0);
+
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
+
+        const uint8_t * GGML_RESTRICT q4 = x[i].ql;
+        const uint8_t * GGML_RESTRICT qh = x[i].qh;
+        const int8_t  * GGML_RESTRICT q8 = y[i].qs;
+
+        const __m128i scale_i8 = __lsx_vld(x[i].scales, 0);
+        const __m128i scales_lo = __lsx_vsllwil_h_b(scale_i8, 0);
+        const __m128i scales_hi = __lsx_vsllwil_h_b(__lsx_vbsrl_v(scale_i8, 8), 0);
+
+        __m128i sumi_0 = __lsx_vldi(0);
+        __m128i sumi_1 = __lsx_vldi(0);
+
+        for (int j = 0; j < QK_K/128; ++j) {
+
+            const __m128i q4bitsH_0 = __lsx_vld((const __m128i*)qh, 0); qh += 16;
+            const __m128i q4bitsH_1 = __lsx_vld((const __m128i*)qh, 0); qh += 16;
+
+            const __m128i q4h_0 = __lsx_vslli_b(__lsx_vandi_b(q4bitsH_0, 3), 4);
+            const __m128i q4h_1 = __lsx_vslli_b(__lsx_vandi_b(q4bitsH_1, 3), 4);
+            const __m128i q4h_2 = __lsx_vslli_b(__lsx_vandi_b(q4bitsH_0, 3 << 2), 2);
+            const __m128i q4h_3 = __lsx_vslli_b(__lsx_vandi_b(q4bitsH_1, 3 << 2), 2);
+            const __m128i q4h_4 = __lsx_vandi_b(q4bitsH_0, 3 << 4);
+            const __m128i q4h_5 = __lsx_vandi_b(q4bitsH_1, 3 << 4);
+            const __m128i q4h_6 = __lsx_vsrli_b(__lsx_vandi_b(q4bitsH_0, 3 << 6), 2);
+            const __m128i q4h_7 = __lsx_vsrli_b(__lsx_vandi_b(q4bitsH_1, 3 << 6), 2);
+
+            const __m128i q4bits1_0 = __lsx_vld((const __m128i*)q4, 0); q4 += 16;
+            const __m128i q4bits1_1 = __lsx_vld((const __m128i*)q4, 0); q4 += 16;
+            const __m128i q4bits2_0 = __lsx_vld((const __m128i*)q4, 0); q4 += 16;
+            const __m128i q4bits2_1 = __lsx_vld((const __m128i*)q4, 0); q4 += 16;
+
+            const __m128i q4_0 = __lsx_vor_v(__lsx_vandi_b(q4bits1_0, 0xf), q4h_0);
+            const __m128i q4_1 = __lsx_vor_v(__lsx_vandi_b(q4bits1_1, 0xf), q4h_1);
+            const __m128i q4_2 = __lsx_vor_v(__lsx_vandi_b(q4bits2_0, 0xf), q4h_2);
+            const __m128i q4_3 = __lsx_vor_v(__lsx_vandi_b(q4bits2_1, 0xf), q4h_3);
+            const __m128i q4_4 = __lsx_vor_v(__lsx_vsrli_b(q4bits1_0, 4), q4h_4);
+            const __m128i q4_5 = __lsx_vor_v(__lsx_vsrli_b(q4bits1_1, 4), q4h_5);
+            const __m128i q4_6 = __lsx_vor_v(__lsx_vsrli_b(q4bits2_0, 4), q4h_6);
+            const __m128i q4_7 = __lsx_vor_v(__lsx_vsrli_b(q4bits2_1, 4), q4h_7);
+
+            const __m128i q8_0 = __lsx_vld((const __m128i*)q8, 0); q8 += 16;
+            const __m128i q8_1 = __lsx_vld((const __m128i*)q8, 0); q8 += 16;
+            const __m128i q8_2 = __lsx_vld((const __m128i*)q8, 0); q8 += 16;
+            const __m128i q8_3 = __lsx_vld((const __m128i*)q8, 0); q8 += 16;
+            const __m128i q8_4 = __lsx_vld((const __m128i*)q8, 0); q8 += 16;
+            const __m128i q8_5 = __lsx_vld((const __m128i*)q8, 0); q8 += 16;
+            const __m128i q8_6 = __lsx_vld((const __m128i*)q8, 0); q8 += 16;
+            const __m128i q8_7 = __lsx_vld((const __m128i*)q8, 0); q8 += 16;
+
+            __m128i p16_0 = lsx_maddubs_h(__lsx_vsub_b(q4_0, m32s), q8_0);
+            __m128i p16_1 = lsx_maddubs_h(__lsx_vsub_b(q4_1, m32s), q8_1);
+            __m128i p16_2 = lsx_maddubs_h(__lsx_vsub_b(q4_2, m32s), q8_2);
+            __m128i p16_3 = lsx_maddubs_h(__lsx_vsub_b(q4_3, m32s), q8_3);
+            __m128i p16_4 = lsx_maddubs_h(__lsx_vsub_b(q4_4, m32s), q8_4);
+            __m128i p16_5 = lsx_maddubs_h(__lsx_vsub_b(q4_5, m32s), q8_5);
+            __m128i p16_6 = lsx_maddubs_h(__lsx_vsub_b(q4_6, m32s), q8_6);
+            __m128i p16_7 = lsx_maddubs_h(__lsx_vsub_b(q4_7, m32s), q8_7);
+
+            const __m128i sc_vec = j == 0 ? scales_lo : scales_hi;
+
+            p16_0 = lsx_madd_h(__lsx_vreplvei_h(sc_vec, 0), p16_0);
+            p16_1 = lsx_madd_h(__lsx_vreplvei_h(sc_vec, 1), p16_1);
+            p16_2 = lsx_madd_h(__lsx_vreplvei_h(sc_vec, 2), p16_2);
+            p16_3 = lsx_madd_h(__lsx_vreplvei_h(sc_vec, 3), p16_3);
+            p16_4 = lsx_madd_h(__lsx_vreplvei_h(sc_vec, 4), p16_4);
+            p16_5 = lsx_madd_h(__lsx_vreplvei_h(sc_vec, 5), p16_5);
+            p16_6 = lsx_madd_h(__lsx_vreplvei_h(sc_vec, 6), p16_6);
+            p16_7 = lsx_madd_h(__lsx_vreplvei_h(sc_vec, 7), p16_7);
+
+            sumi_0 = __lsx_vadd_w(sumi_0, __lsx_vadd_w(p16_0, p16_2));
+            sumi_1 = __lsx_vadd_w(sumi_1, __lsx_vadd_w(p16_1, p16_3));
+            sumi_0 = __lsx_vadd_w(sumi_0, __lsx_vadd_w(p16_4, p16_6));
+            sumi_1 = __lsx_vadd_w(sumi_1, __lsx_vadd_w(p16_5, p16_7));
+        }
+
+        __m128 p_0 = __lsx_vfmul_s(__lsx_vreplfr2vr_s(d), __lsx_vffint_s_w(sumi_0));
+        __m128 p_1 = __lsx_vfmul_s(__lsx_vreplfr2vr_s(d), __lsx_vffint_s_w(sumi_1));
+        acc_0 = __lsx_vfadd_s(p_0, acc_0);
+        acc_1 = __lsx_vfadd_s(p_1, acc_1);
+    }
+
+    *s = hsum_float_4x4(acc_0, acc_1, (__m128)__lsx_vldi(0), (__m128)__lsx_vldi(0));
+
 #else
     UNUSED(x);
     UNUSED(y);
@@ -2149,6 +2271,35 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
 
     *s = hsum_float_8(accum);
 
+#elif defined(__loongarch_sx)
+
+    const __m128i values128 = __lsx_vld((const __m128i*)kvalues_iq4nl, 0);
+
+    __m128 accum = (__m128)__lsx_vldi(0);
+    for (int ibl = 0; ibl < nb; ++ibl) {
+        const uint8_t * qs = x[ibl].qs;
+        const int8_t  * q8 = y[ibl].qs;
+        uint16_t sh = x[ibl].scales_h;
+        __m128i sumi = __lsx_vldi(0);
+        for (int ib = 0; ib < QK_K/32; ++ib) {
+            const __m128i q4bits = __lsx_vld((const __m128i*)qs, 0); qs += 16;
+            const __m128i q8b_0 = __lsx_vld((const __m128i*)q8, 0); q8 += 16;
+            const __m128i q8b_1 = __lsx_vld((const __m128i*)q8, 0); q8 += 16;
+            const __m128i q4b_0 = __lsx_vshuf_b(values128, values128, __lsx_vandi_b(q4bits, 0xf));
+            const __m128i q4b_1 = __lsx_vshuf_b(values128, values128, __lsx_vsrli_b(q4bits, 4));
+            const __m128i p16_0 = lsx_maddubs_h(q4b_0, q8b_0);
+            const __m128i p16_1 = lsx_maddubs_h(q4b_1, q8b_1);
+            const int16_t ls = (((x[ibl].scales_l[ib/2] >> ((ib & 1) * 4)) & 0xf) | ((sh & 0x3) << 4)) - 32;
+            sh >>= 2;
+            sumi = __lsx_vadd_w(lsx_madd_h(p16_0, __lsx_vreplgr2vr_h(ls)), sumi);
+            sumi = __lsx_vadd_w(lsx_madd_h(p16_1, __lsx_vreplgr2vr_h(ls)), sumi);
+        }
+        const float ds = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
+        accum = __lsx_vfadd_s(__lsx_vfmul_s(__lsx_vreplfr2vr_s(ds), __lsx_vffint_s_w(sumi)), accum);
+    }
+
+    *s = ((v4f32)lsx_hadd_s(lsx_hadd_s(accum, accum), lsx_hadd_s(accum, accum)))[0];
+
 #else
     UNUSED(x);
     UNUSED(y);
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 7485ba4fc86..dc73696ad9f 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -2235,8 +2235,42 @@ static void ggml_compute_forward_fill_f32(const ggml_compute_params * params, gg
     }
 }
 
+static void ggml_compute_forward_fill_f16(const ggml_compute_params * params, ggml_tensor * dst) {
+    // the fill value is read as f32 from op_params[0] and converted to f16 once, up front
+    const ggml_fp16_t fill_value = GGML_CPU_FP32_TO_FP16(ggml_get_op_params_f32(dst, 0));
+    GGML_TENSOR_LOCALS(int64_t, ne, dst, ne);
+    GGML_TENSOR_LOCALS(size_t,  nb, dst, nb);
+
+    // only the rows in [row_begin, row_end) returned by get_thread_range are written here
+    const auto [row_begin, row_end] = get_thread_range(params, dst);
+
+    for (int64_t row = row_begin; row < row_end; ++row) {
+        const int64_t rows_per_i03 = ne2*ne1; // unflatten the row index into (i03, i02, i01)
+        const int64_t i03 = row/rows_per_i03;
+        const int64_t i02 = (row - i03*rows_per_i03)/ne1;
+        const int64_t i01 = row - i03*rows_per_i03 - i02*ne1;
+        char * row_base = (char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1;
+        ggml_vec_set_f16(ne0, (ggml_fp16_t *) row_base, fill_value); // ne0 elements per row
+    }
+}
+
 void ggml_compute_forward_fill(const ggml_compute_params * params, ggml_tensor * dst) {
-    ggml_compute_forward_fill_f32(params, dst);
+    const ggml_tensor * src0 = dst->src[0]; // the kernel is selected by the type of the first source tensor
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_fill_f32(params, dst);
+            } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_fill_f16(params, dst);
+            } break;
+        default: // only F32 and F16 are handled; anything else aborts
+            {
+                GGML_ABORT("unsupported type for ggml_compute_forward_fill: %s", ggml_type_name(src0->type));
+            }
+    }
 }
 
 // ggml_compute_tri
diff --git a/ggml/src/ggml-cpu/simd-mappings.h b/ggml/src/ggml-cpu/simd-mappings.h
index 0deda930985..62e687201ef 100644
--- a/ggml/src/ggml-cpu/simd-mappings.h
+++ b/ggml/src/ggml-cpu/simd-mappings.h
@@ -1125,25 +1125,12 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
 #define GGML_F16_EPR  4
 
 static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
-    float tmp[4];
-
-    tmp[0] = GGML_CPU_FP16_TO_FP32(x[0]);
-    tmp[1] = GGML_CPU_FP16_TO_FP32(x[1]);
-    tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]);
-    tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]);
-
-    return (__m128)__lsx_vld(tmp, 0);
+    return __lsx_vfcvtl_s_h(__lsx_vldrepl_d((const void *)x, 0)); // 64-bit load: touches only the 4 halves at x (a 128-bit vld would read past them); the low 4 halves are then widened to f32
 }
 
 static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
-    float arr[4];
-
-    __lsx_vst(y, arr, 0);
-
-    x[0] = GGML_CPU_FP32_TO_FP16(arr[0]);
-    x[1] = GGML_CPU_FP32_TO_FP16(arr[1]);
-    x[2] = GGML_CPU_FP32_TO_FP16(arr[2]);
-    x[3] = GGML_CPU_FP32_TO_FP16(arr[3]);
+    __m128i a = __lsx_vfcvt_h_s(y, y); // narrow f32 -> f16 in-register; y is passed as both source operands
+    memcpy(x, &a, sizeof(ggml_fp16_t) * 4); // only the first 4 ggml_fp16_t elements of 'a' are written to x
 }
 
 #define GGML_F32Cx4             __m128
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 50d7763dcdd..560fab0b17b 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -7,6 +7,7 @@
 #include <cstdint>
 #include <cstdlib>
 #include <memory>
+#include <mutex>
 
 #if defined(GGML_USE_HIP)
 #define GGML_COMMON_DECL_HIP
@@ -1552,6 +1553,62 @@ struct ggml_cuda_pdl_config {
     ggml_cuda_pdl_config& operator=(ggml_cuda_pdl_config&&) = delete;
 
 };
+
+static bool ggml_cuda_kernel_can_use_pdl(const void * kernel) { // cached per-(device, kernel) answer to "is this kernel's PTX version >= 90?"
+    const int device = ggml_cuda_get_device();
+
+    struct cache_key {
+        int          device;
+        const void * kernel;
+
+        bool operator==(const cache_key & other) const { return device == other.device && kernel == other.kernel; }
+    };
+
+    struct cache_key_hash {
+        // 64-bit multiply/xor-shift mixing function for better hash distribution (vs. just std::hash which in some implementations simply returns the identity)
+        static size_t hash_mix(size_t x) {
+            std::uint64_t       y = x;
+            const std::uint64_t m = 0xe9846af9b1a615d; // NOTE(review): this constant and the 32/32/28 shifts match Boost.ContainerHash's 64-bit hash_mix, not MurmurHash3's fmix64
+
+            y ^= y >> 32;
+            y *= m;
+            y ^= y >> 32;
+            y *= m;
+            y ^= y >> 28;
+
+            return static_cast<size_t>(y);
+        }
+
+        size_t operator()(const cache_key & key) const {
+            // Use a nonzero seed to avoid mapping all-zero keys to zero
+            size_t h = 42;
+            h        = hash_mix(h + key.device);
+            h        = hash_mix(h + reinterpret_cast<size_t>(key.kernel));
+            return h;
+        }
+    };
+
+    static std::mutex                                          cache_mutex;
+    static std::unordered_map<cache_key, bool, cache_key_hash> cache;
+
+    const cache_key             key = { device, kernel };
+    std::lock_guard<std::mutex> lock(cache_mutex); // held until return, so the lookup, the attribute query on a miss, and the insert are serialized
+    const auto                  it = cache.find(key);
+    if (it != cache.end()) {
+        return it->second;
+    }
+
+    cudaFuncAttributes attr = {};
+    CUDA_CHECK(cudaFuncGetAttributes(&attr, kernel));
+
+    // PDL device-side primitives are emitted only for PTX versions >= 90.
+    // We have to guard on a loaded kernel's PTX version so a kernel forward-JIT'ed
+    // from pre-Hopper PTX to a Hopper-or-newer GPU does not opt into PDL.
+    const bool can_use_pdl = attr.ptxVersion >= 90;
+    cache.emplace(key, can_use_pdl); // remember the result so later launches of this kernel on this device skip the query
+    return can_use_pdl;
+}
+
 #endif //defined(GGML_CUDA_USE_PDL)
 
 
@@ -1564,8 +1621,7 @@ static __inline__ void ggml_cuda_kernel_launch(Kernel kernel, const ggml_cuda_ke
         return env == nullptr || std::atoi(env) != 0;
     }();
 
-    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
-    if (env_pdl_enabled && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_HOPPER) {
+    if (env_pdl_enabled && ggml_cuda_kernel_can_use_pdl(reinterpret_cast<const void *>(kernel))) {
         auto pdl_cfg = ggml_cuda_pdl_config(launch_params);
 
         CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, kernel, std::forward<Args>(args)... ));
diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh
index debcb6e5447..d650b5fbd0f 100644
--- a/ggml/src/ggml-cuda/fattn-common.cuh
+++ b/ggml/src/ggml-cuda/fattn-common.cuh
@@ -1153,8 +1153,8 @@ void launch_fattn(
 
     GGML_ASSERT(block_dim.x % warp_size == 0);
 
-    const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(blocks_num, block_dim, nbytes_shared, main_stream);
-    ggml_cuda_kernel_launch(fattn_kernel, launch_params,
+        // disabled PDL enrollment for now due to a compiler bug.
+        fattn_kernel<<<blocks_num, block_dim, nbytes_shared, main_stream>>>(
         (const char *) Q->data,
         K_data,
         V_data,
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 23d1c069248..18aaa098398 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2570,6 +2570,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
             use_mul_mat_q           = use_mul_mat_q             && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1], /*n_experts=*/0);
             use_mul_mat_f           = use_mul_mat_f             && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
             use_mul_mat_vec_f       = use_mul_mat_vec_f         && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
+            use_mul_mat_vec_q       = use_mul_mat_vec_q         && ggml_cuda_should_use_mmvq(src0->type, cc, src1->ne[1]);
             any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16   || !fast_fp16_hardware_available(cc);
         }
     } else {
@@ -2578,6 +2579,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
         use_mul_mat_q           = use_mul_mat_q             && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1], /*n_experts=*/0);
         use_mul_mat_f           = use_mul_mat_f             && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
         use_mul_mat_vec_f       = use_mul_mat_vec_f         && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
+        use_mul_mat_vec_q       = use_mul_mat_vec_q         && ggml_cuda_should_use_mmvq(src0->type, cc, src1->ne[1]);
         any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16   || !fast_fp16_hardware_available(cc);
     }
 
@@ -4992,8 +4994,14 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t *
 }
 
 static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend_dev_t dev) {
-    GGML_UNUSED(dev);
-    return GGML_BACKEND_DEVICE_TYPE_GPU;
+    ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *) dev->context;
+
+    cudaDeviceProp prop;
+    CUDA_CHECK(cudaGetDeviceProperties(&prop, ctx->device)); // queried on every call; the result is not cached in this function
+
+    return prop.integrated // nonzero 'integrated' is reported as IGPU, otherwise as a regular GPU
+        ? GGML_BACKEND_DEVICE_TYPE_IGPU
+        : GGML_BACKEND_DEVICE_TYPE_GPU;
 }
 
 static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu
index 13b8b855282..ecb6fdedadd 100644
--- a/ggml/src/ggml-cuda/mmvq.cu
+++ b/ggml/src/ggml-cuda/mmvq.cu
@@ -63,6 +63,7 @@ static constexpr __host__ __device__ int get_vdr_mmvq(ggml_type type) {
 
 enum mmvq_parameter_table_id {
     MMVQ_PARAMETERS_GENERIC = 0,
+    MMVQ_PARAMETERS_TURING,
     MMVQ_PARAMETERS_GCN,
     MMVQ_PARAMETERS_RDNA2,
     MMVQ_PARAMETERS_RDNA3_0,
@@ -78,6 +79,8 @@ static constexpr __device__ mmvq_parameter_table_id get_device_table_id() {
     return MMVQ_PARAMETERS_RDNA2;
 #elif defined(GCN) || defined(CDNA)
     return MMVQ_PARAMETERS_GCN;
+#elif defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING && __CUDA_ARCH__ < GGML_CUDA_CC_AMPERE
+    return MMVQ_PARAMETERS_TURING;
 #else
     return MMVQ_PARAMETERS_GENERIC;
 #endif
@@ -96,6 +99,9 @@ static __host__ mmvq_parameter_table_id get_device_table_id(int cc) {
     if (GGML_CUDA_CC_IS_GCN(cc) || GGML_CUDA_CC_IS_CDNA(cc)) {
         return MMVQ_PARAMETERS_GCN;
     }
+    if (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING && ggml_cuda_highest_compiled_arch(cc) < GGML_CUDA_CC_AMPERE) {
+        return MMVQ_PARAMETERS_TURING;
+    }
     return MMVQ_PARAMETERS_GENERIC;
 }
 
@@ -271,6 +277,53 @@ int get_mmvq_mmid_max_batch(ggml_type type, int cc) {
     return MMVQ_MAX_BATCH_SIZE;
 }
 
+bool ggml_cuda_should_use_mmvq(enum ggml_type type, int cc, int64_t ne11) { // true when ne11 is within the MMVQ limit for this type and compute capability
+    if (GGML_CUDA_CC_IS_CDNA(cc)) {
+        if (GGML_CUDA_CC_IS_CDNA1(cc)) {
+            switch (type) {
+                case GGML_TYPE_Q4_0:
+                case GGML_TYPE_Q4_1:
+                    return ne11 <= 7;
+                case GGML_TYPE_Q5_1: // same limit as Q4_0/Q4_1 above
+                    return ne11 <= 7;
+                case GGML_TYPE_Q8_0:
+                    return ne11 <= 6;
+                case GGML_TYPE_Q2_K:
+                    return ne11 <= 4;
+                case GGML_TYPE_Q3_K:
+                    return ne11 <= 3;
+                case GGML_TYPE_Q4_K:
+                    return ne11 <= 2;
+                case GGML_TYPE_Q5_K:
+                    return ne11 <= 3;
+                case GGML_TYPE_Q6_K:
+                    return ne11 <= 4;
+                case GGML_TYPE_IQ1_S:
+                    return ne11 <= 5;
+                case GGML_TYPE_IQ2_XXS:
+                case GGML_TYPE_IQ3_S:
+                case GGML_TYPE_IQ4_XS:
+                    return ne11 <= 6;
+                default: // NOTE(review): Q5_0 has no case in this table and takes this default — confirm that is intended
+                    return ne11 <= MMVQ_MAX_BATCH_SIZE;
+            }
+        }
+        switch (type) { // tuned for CDNA2
+            case GGML_TYPE_Q2_K:
+                return ne11 <= 5;
+            case GGML_TYPE_Q3_K:
+            case GGML_TYPE_Q4_K:
+            case GGML_TYPE_Q5_K:
+                return ne11 <= 3;
+            case GGML_TYPE_Q6_K:
+                return ne11 <= 5;
+            default:
+                return ne11 <= MMVQ_MAX_BATCH_SIZE;
+        }
+    }
+    return ne11 <= MMVQ_MAX_BATCH_SIZE; // every non-CDNA device uses the global MMVQ batch limit
+}
+
 // Device constexpr: returns the max batch size for the current arch+type at compile time.
 template <ggml_type type>
 static constexpr __device__ int get_mmvq_mmid_max_batch_for_device() {
@@ -370,11 +423,38 @@ static constexpr __host__ __device__ int calc_nwarps(ggml_type type, int ncols_d
         }
         return 1;
     }
+    if (table_id == MMVQ_PARAMETERS_TURING) {
+        if (ncols_dst == 1) {
+            switch (type) {
+                case GGML_TYPE_Q2_K:
+                case GGML_TYPE_Q3_K:
+                case GGML_TYPE_Q4_K:
+                case GGML_TYPE_Q5_K:
+                case GGML_TYPE_Q6_K:
+                    return 2;
+                default:
+                    return 4;
+            }
+        }
+        switch (ncols_dst) {
+            case 2:
+            case 3:
+            case 4:
+                return 4;
+            case 5:
+            case 6:
+            case 7:
+            case 8:
+                return 2;
+            default:
+                return 1;
+        }
+    }
     return 1;
 }
 
 static constexpr __host__ __device__ int calc_rows_per_block(int ncols_dst, int table_id, bool small_k = false, int nwarps = 1) {
-    if (table_id == MMVQ_PARAMETERS_GENERIC || table_id == MMVQ_PARAMETERS_GCN) {
+    if (table_id == MMVQ_PARAMETERS_GENERIC || table_id == MMVQ_PARAMETERS_GCN || table_id == MMVQ_PARAMETERS_TURING) {
         switch (ncols_dst) {
             case 1:
                 return small_k ? nwarps : 1;
diff --git a/ggml/src/ggml-cuda/mmvq.cuh b/ggml/src/ggml-cuda/mmvq.cuh
index 6bf0a8e8677..5605bf7a4e6 100644
--- a/ggml/src/ggml-cuda/mmvq.cuh
+++ b/ggml/src/ggml-cuda/mmvq.cuh
@@ -2,6 +2,8 @@
 
 #define MMVQ_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVQ kernels.
 
+bool ggml_cuda_should_use_mmvq(enum ggml_type type, int cc, int64_t ne11);
+
 // Returns the maximum batch size for which MMVQ should be used for MUL_MAT_ID,
 // based on the quantization type and GPU architecture (compute capability).
 int get_mmvq_mmid_max_batch(ggml_type type, int cc);
diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index 3af7aff7028..48ded82e83c 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -39,7 +39,7 @@
 #include "ggml-hexagon.h"
 #include "ggml-impl.h"
 #include "ggml-quants.h"
-#include "op-desc.h"
+#include "htp-opnode.h"
 #include "htp-ops.h"
 #include "htp_iface.h"
 #include "htp-drv.h"
@@ -102,23 +102,23 @@ static const char * status_to_str(uint32_t status) {
 
 // ** debug helpers
 
-static void ggml_hexagon_dump_op_exec(const std::string &sess_name, const ggml_tensor * op, const uint32_t req_flags) {
+static void ggml_hexagon_dump_op_exec(const std::string &sess_name, const htp_opnode & node, const uint32_t req_flags) {
     if (!opt_verbose) return;
 
-    op_desc desc(op);
+    htp_opformat fmt(node); // provides the names/dims/types/strides/buffs strings logged below
     GGML_LOG_DEBUG("ggml-hex: %s execute-op %s: %s : %s : %s : %s : %s : flags 0x%x\n", sess_name.c_str(),
-                ggml_op_desc(op), desc.names, desc.dims, desc.types, desc.strides, desc.buffs, req_flags);
+                node.op_name().c_str(), fmt.names, fmt.dims, fmt.types, fmt.strides, fmt.buffs, req_flags);
 }
 
 static void ggml_hexagon_dump_op_supp(const std::string &sess_name, const struct ggml_tensor * op, bool supp) {
     if (!opt_verbose) return;
 
-    op_desc desc(op);
+    htp_opformat fmt(htp_opnode{const_cast<ggml_tensor*>(op), {}, HTP_OP_INVALID}); // wrap the bare tensor in a temporary node (no fused ops, HTP_OP_INVALID opcode) purely for formatting
     GGML_LOG_DEBUG("ggml-hex: %s supports-op %s: %s : %s : %s : %s : %s : %s\n", sess_name.c_str(),
-                ggml_op_desc(op), desc.names, desc.dims, desc.types, desc.strides, desc.buffs, supp ? "yes" : "no");
+                ggml_op_desc(op), fmt.names, fmt.dims, fmt.types, fmt.strides, fmt.buffs, supp ? "yes" : "no");
 }
 
-static void ggml_hexagon_dump_op_prof(const std::string &sess_name, const ggml_tensor * op,
+static void ggml_hexagon_dump_op_prof(const std::string &sess_name, const htp_opnode & node,
                                       uint32_t op_usec, uint32_t op_cycles, const uint32_t pmu[]) {
     if (!opt_profile) return;
 
@@ -129,15 +129,16 @@ static void ggml_hexagon_dump_op_prof(const std::string &sess_name, const ggml_t
                 pmu[0], pmu[1], pmu[2], pmu[3], pmu[4], pmu[5], pmu[6], pmu[7]);
     }
 
-    op_desc desc(op);
+    htp_opformat fmt(node);
     GGML_LOG_DEBUG("ggml-hex: %s profile-op %s: %s : %s : %s : %s : usec %u cycles %u%s\n", sess_name.c_str(),
-            ggml_op_desc(op), desc.names, desc.dims, desc.types, desc.strides, op_usec, op_cycles, pmu_str);
+            node.op_name().c_str(), fmt.names, fmt.dims, fmt.types, fmt.strides, op_usec, op_cycles, pmu_str);
 }
 
 // ** backend sessions
 
 struct ggml_hexagon_opbatch;
 struct ggml_hexagon_opqueue;
+struct htp_opnode;
 
 struct ggml_hexagon_session {
     std::string      name;
@@ -167,7 +168,7 @@ struct ggml_hexagon_session {
     void allocate(int dev_id) noexcept(false);
     void release() noexcept(true);
 
-    void enqueue_op(htp_op_code opcode, const ggml_tensor *op);
+    void enqueue_op(const htp_opnode & node);
     void flush(bool all = true);
 
     void flush_pending(bool all = false);
@@ -1782,12 +1783,10 @@ static ggml_backend_buffer_type_i ggml_backend_hexagon_repack_buffer_type_interf
     /* .is_host          = */ ggml_backend_hexagon_repack_buffer_type_is_host,
 };
 
-// Backend session implementation
-
 struct ggml_hexagon_opbatch {
     ggml_hexagon_session*            sess;
 
-    std::vector<const ggml_tensor*>  ops;       // pointers to original ops
+    std::vector<htp_opnode>          ops;       // htp_opnode of ops
 
     std::vector<htp_buf_desc>        h_bufs;    // htp buffer descriptors
     std::vector<htp_tensor>          h_tens;    // htp tensor descriptors
@@ -1919,7 +1918,7 @@ struct ggml_hexagon_opbatch {
         return ti;
     }
 
-    bool fit_op(const struct ggml_tensor *t) const {
+    bool fit_op(const htp_opnode & node) const {
         if (n_ops >= n_ops_max ) return false;
 
         // check how much extras we will need
@@ -1939,10 +1938,10 @@ struct ggml_hexagon_opbatch {
             }
         };
 
-        for (unsigned int i=0; i < HTP_OP_MAX_INPUTS && t->src[i]; i++) {
-            fit_tensor(t->src[i]);
+        for (const auto * src : node.get_inputs()) {
+            fit_tensor(src);
         }
-        fit_tensor(t);
+        fit_tensor(node.dst());
 
         if ((extra_bufs + n_bufs) > n_bufs_max) return false;
         if ((extra_tens + n_tens) > n_tens_max) return false;
@@ -1952,29 +1951,30 @@ struct ggml_hexagon_opbatch {
     }
 
     // assumes that fit_op() was called first and returned true
-    void add_op(htp_op_code opcode, const struct ggml_tensor * t) {
+    void add_op(const htp_opnode & node) {
         // Add new op
 
         unsigned int n = n_ops++;
         GGML_ASSERT(n_ops <= n_ops_max);
 
-        ops[n] = t;
+        ops[n] = node; // keep a copy of the node alongside its descriptor slot
 
         htp_op_desc &o = h_ops[n];
-        memcpy(&o.params, &t->op_params, sizeof(t->op_params));
-        o.opcode = opcode;
+        memcpy(&o.params, &node.node->op_params, sizeof(node.node->op_params)); // params are taken from the primary tensor (node.node)
+        o.opcode = node.opcode; // opcode was chosen by the caller when the node was built
         o.flags  = 0;
 
         if (!(opt_opstage & HTP_OPSTAGE_COMPUTE)) {
             o.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
         }
 
-        ggml_hexagon_dump_op_exec(sess->c_name(), t, o.flags);
+        ggml_hexagon_dump_op_exec(sess->c_name(), node, o.flags);
 
+        auto inputs = node.get_inputs(); // input tensors as reported by the node
         for (unsigned int i=0; i < HTP_OP_MAX_INPUTS; i++) {
-            o.src[i] = t->src[i] ? add_tensor(t->src[i]) : 0xffff;
+            o.src[i] = (i < inputs.size() && inputs[i]) ? add_tensor(inputs[i]) : 0xffff; // slots past the input list, or null inputs, get the 0xffff marker
         }
-        o.dst = add_tensor(t);
+        o.dst = add_tensor(node.dst()); // destination tensor as reported by the node
     }
 };
 
@@ -1983,7 +1983,7 @@ struct ggml_hexagon_opqueue {
     ggml_hexagon_shared_buffer *shm_buf;
     size_t                      shm_blk_size;
 
-    using opvec = std::vector<const ggml_tensor*>;
+    using opvec = std::vector<htp_opnode>;
 
     std::queue<unsigned int>    done;       // completed batch ids
     std::vector<opvec>          op_cache;   // per batch op cache
@@ -2182,11 +2182,11 @@ void ggml_hexagon_session::flush_batch() {
     }
 }
 
-void ggml_hexagon_session::enqueue_op(htp_op_code opcode, const ggml_tensor *op) {
-    if (!op_batch->fit_op(op)) {
+void ggml_hexagon_session::enqueue_op(const htp_opnode & node) {
+    if (!op_batch->fit_op(node)) { // the current batch cannot take this node: flush it before adding
         flush_batch();
     }
-    op_batch->add_op(opcode, op);
+    op_batch->add_op(node); // add_op expects fit_op to have been checked; it is not re-checked after the flush
 }
 
 // Flush HTP response queue i.e wait for all outstanding requests to complete
@@ -3179,10 +3179,43 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
 
     HEX_VERBOSE("ggml-hex: %s graph-compute n_nodes %d\n", sess->c_name(), graph->n_nodes);
 
+    std::vector<htp_opnode> nodes;
+    nodes.reserve(graph->n_nodes);
+
+    // Fusion
     for (int i = 0; i < graph->n_nodes; ++i) {
         ggml_tensor * n = graph->nodes[i];
-        if (op_is_compute(n) && (opt_opstage & HTP_OPSTAGE_QUEUE)) {
-            sess->enqueue_op(op_remap_to_htp(n), n);
+        if (!op_is_compute(n)) {
+            continue;
+        }
+
+        ggml_tensor * next_node = (i + 1 < graph->n_nodes) ? graph->nodes[i + 1] : nullptr;
+
+        htp_opnode node = {
+            /*.node =*/ n,
+            /*.fused =*/ {},
+            /*.opcode =*/ HTP_OP_INVALID
+        };
+
+        if (n->op == GGML_OP_RMS_NORM && next_node) {
+            if (next_node->op == GGML_OP_MUL && op_is_compute(next_node) && ggml_can_fuse(graph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) {
+                node.add_fused(next_node);
+                node.opcode = HTP_OP_RMS_NORM_MUL;
+                i++; // skip the fused MUL node
+            }
+        }
+
+        if (node.opcode == HTP_OP_INVALID) {
+            node.opcode = op_remap_to_htp(n);
+        }
+
+        nodes.push_back(std::move(node));
+    }
+
+    // Queue and execute
+    if (opt_opstage & HTP_OPSTAGE_QUEUE) {
+        for (const auto & node : nodes) {
+            sess->enqueue_op(node);
         }
     }
 
@@ -3201,51 +3234,7 @@ static void ggml_backend_hexagon_synchronize(ggml_backend_t backend) {
     sess->flush();
 }
 
-struct node_info {
-    ggml_tensor * node;
-
-    std::vector<ggml_tensor *> fused;
-
-    ggml_op op() const {
-        return node->op;
-    }
-
-    const ggml_tensor * dst() const {
-        return fused.empty() ? node : fused.back();
-    }
-
-    const ggml_tensor * src0() const {
-        return node->src[0];
-    }
-
-    const ggml_tensor * src1() const {
-        return node->src[1];
-    }
-
-    bool is_empty() const {
-        return ggml_op_is_empty(node->op);
-    }
-
-    void add_fused(ggml_tensor * t) {
-        fused.push_back(t);
-    }
-
-    bool stackable() const {
-        switch (this->op()) {
-            case GGML_OP_MUL_MAT:
-            case GGML_OP_MUL_MAT_ID:
-                return ggml_is_quantized(this->src0()->type);
-            default:
-                return false;
-        }
-    }
-
-    bool same_input(const node_info& n) const {
-        return n.src1() == this->src1();
-    }
-};
-
-static std::vector<int> ggml_hexagon_graph_optimize_reorder(const std::vector<node_info> & nodes) {
+static std::vector<int> ggml_hexagon_graph_optimize_reorder(const std::vector<htp_opnode> & nodes) {
     const int n = nodes.size();
 
     std::vector<int> res;
@@ -3299,14 +3288,14 @@ static void ggml_backend_hexagon_graph_optimize(ggml_backend_t backend, ggml_cgr
 
     enum ggml_op ops[MAX_FUSE];
 
-    std::vector<node_info> nodes;
+    std::vector<htp_opnode> nodes;
     nodes.reserve(gf->n_nodes);
 
     // fuse nodes:
     // we don't want to make reorders that break fusing, so we first pack all fusable tensors
     //   and perform the reorder over the fused nodes. after the reorder is done, we unfuse
     for (int i = 0; i < n; i++) {
-        node_info node = {
+        htp_opnode node = {
             /*.node =*/gf->nodes[i],
             /*.fused =*/{},
         };
diff --git a/ggml/src/ggml-hexagon/htp-opnode.h b/ggml/src/ggml-hexagon/htp-opnode.h
new file mode 100644
index 00000000000..14b232240b4
--- /dev/null
+++ b/ggml/src/ggml-hexagon/htp-opnode.h
@@ -0,0 +1,241 @@
+#ifndef HTP_OPNODE_H
+#define HTP_OPNODE_H
+
+#define GGML_COMMON_IMPL_CPP
+#include "ggml-backend-impl.h"
+#include "ggml-common.h"
+
+#include <string>
+#include <vector>
+#include <stdio.h>
+#include "htp-ops.h"
+
+// One HTP operation: a primary ggml node plus any follow-up nodes fused into it.
+struct htp_opnode {
+    // NOTE: keep the member order, the struct is brace-initialized positionally
+    ggml_tensor * node = nullptr;          // primary graph node
+    std::vector<ggml_tensor *> fused;      // nodes folded into this op, in the order they were added
+    htp_op_code opcode = HTP_OP_INVALID;   // HTP opcode assigned to this op
+
+    // ggml op of the primary node
+    ggml_op op() const { return node->op; }
+
+    // tensor holding the final result: the last fused node, or the node itself
+    const ggml_tensor * dst() const {
+        if (fused.empty()) { return node; }
+        return fused.back();
+    }
+
+    // first / second source of the primary node
+    const ggml_tensor * src0() const { return node->src[0]; }
+    const ggml_tensor * src1() const { return node->src[1]; }
+
+    bool is_empty() const { return ggml_op_is_empty(node->op); }
+
+    // append t to the list of nodes fused into this op
+    void add_fused(ggml_tensor * t) { fused.push_back(t); }
+
+    // true for MUL_MAT / MUL_MAT_ID nodes whose src0 has a quantized type
+    bool stackable() const {
+        const ggml_op o = op();
+        if (o != GGML_OP_MUL_MAT && o != GGML_OP_MUL_MAT_ID) {
+            return false;
+        }
+        return ggml_is_quantized(src0()->type);
+    }
+
+    // true when both ops read the same src1 tensor
+    bool same_input(const htp_opnode& n) const { return n.src1() == src1(); }
+
+    // External inputs of the op: the sources of the primary node followed by the
+    // sources of every fused node, in order, with two kinds of entries left out:
+    //   - tensors produced by this op itself (the node or any fused node)
+    //   - tensors that were already collected (each input appears only once)
+    // Scanning of a node's src[] stops at its first null entry.
+    // NOTE(review): the result is compacted, so index i is not necessarily src[i] of
+    // the primary node (e.g. when src[0] == src[1]) - confirm consumers expect that.
+    std::vector<const ggml_tensor *> get_inputs() const {
+        std::vector<const ggml_tensor *> res;
+        // is t computed by this op (and therefore not an external input)?
+        auto is_internal = [this](const ggml_tensor * t) {
+            bool hit = (t == node);
+            for (size_t k = 0; !hit && k < fused.size(); k++) {
+                hit = (fused[k] == t);
+            }
+            return hit;
+        };
+
+        // collect the not-yet-seen external sources of one node
+        auto collect = [&](const ggml_tensor * owner) {
+            for (int i = 0; i < GGML_MAX_SRC; i++) {
+                const ggml_tensor * s = owner->src[i];
+                if (!s) {
+                    break;
+                }
+                bool skip = is_internal(s);
+                for (size_t k = 0; !skip && k < res.size(); k++) {
+                    skip = (res[k] == s);
+                }
+                if (!skip) {
+                    res.push_back(s);
+                }
+            }
+        };
+
+        collect(node);
+        for (const ggml_tensor * f : fused) {
+            collect(f);
+        }
+        return res;
+    }
+
+    // op description of the node, with "+<desc>" appended for every fused node
+    std::string op_name() const {
+        std::string name = ggml_op_desc(node);
+        for (const ggml_tensor * f : fused) {
+            name += "+";
+            name += ggml_op_desc(f);
+        }
+        return name;
+    }
+};
+
+// Renders printable summaries (dims, strides, types, buffers, names) of an htp_opnode.
+// Each summary reads "in0 x in1 x ... -> dst", or just "dst" when the op has no inputs.
+// Every write is bounded by the destination size: text that does not fit is truncated
+// (and still NUL-terminated) instead of running past the end of the buffer.
+struct htp_opformat {
+    enum : size_t {
+        // capacity of the strides / dims / buffs / names buffers
+        BUF_WIDE  = 64 * GGML_MAX_SRC,
+        // capacity of the types buffer
+        BUF_TYPES = 16 * GGML_MAX_SRC,
+        // default bound when formatting a single tensor
+        BUF_ONE   = 64,
+    };
+
+    char strides[BUF_WIDE];
+    char dims[BUF_WIDE];
+    char types[BUF_TYPES];
+    char buffs[BUF_WIDE];
+    char names[BUF_WIDE];
+
+    // Turn an snprintf() result into the number of characters actually stored.
+    // snprintf() reports the length the full text would have had, which can exceed
+    // the buffer; advancing a cursor by that value would step outside of it.
+    static int stored(int n, size_t size) {
+        if (n < 0 || size == 0) {
+            return 0;
+        }
+        return (size_t) n < size ? n : (int) (size - 1);
+    }
+
+    // Shared layout for all summaries.
+    //   str, size : destination buffer and its capacity in bytes
+    //   node      : op whose inputs (node.get_inputs()) and destination are printed
+    //   emit      : emit(p, n, t) prints tensor t into at most n bytes at p and
+    //               returns the number of characters it stored
+    template <typename F>
+    static void format_op(char * str, size_t size, const htp_opnode & node, F emit) {
+        if (size == 0) {
+            return;
+        }
+        str[0] = '\0';
+
+        // pos stays below size, so there is always room left for the terminator
+        size_t pos = 0;
+        const auto inputs = node.get_inputs();
+        for (size_t i = 0; i < inputs.size(); i++) {
+            if (i > 0) {
+                pos += (size_t) stored(snprintf(str + pos, size - pos, " x "), size - pos);
+            }
+            pos += (size_t) emit(str + pos, size - pos, inputs[i]);
+        }
+        if (!inputs.empty()) {
+            pos += (size_t) stored(snprintf(str + pos, size - pos, " -> "), size - pos);
+        }
+        emit(str + pos, size - pos, node.dst());
+    }
+
+    // Print "ne0:ne1", or all four extents when ne2 / ne3 are not both 1.
+    // Returns the number of characters stored in str (at most size - 1).
+    int format_tensor_dims(char * str, const struct ggml_tensor * t, size_t size = BUF_ONE) {
+        if (t->ne[2] == 1 && t->ne[3] == 1) {
+            return stored(snprintf(str, size, "%d:%d", (int) t->ne[0], (int) t->ne[1]), size);
+        } else {
+            return stored(snprintf(str, size, "%d:%d:%d:%d",
+                                   (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]), size);
+        }
+    }
+
+    // Dimensions of every input and of the destination.
+    void format_op_dims(char * str, const htp_opnode & node, size_t size = BUF_WIDE) {
+        format_op(str, size, node, [this](char * p, size_t n, const ggml_tensor * t) {
+            return format_tensor_dims(p, t, n);
+        });
+    }
+
+    // Print the nb[] strides in the same 2- or 4-field layout as format_tensor_dims();
+    // a trailing "!" marks a tensor that is not contiguous.
+    int format_tensor_strides(char * str, const struct ggml_tensor * t, size_t size = BUF_ONE) {
+        const char * c = ggml_is_contiguous(t) ? "" : "!";
+
+        if (t->ne[2] == 1 && t->ne[3] == 1) {
+            return stored(snprintf(str, size, "%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], c), size);
+        } else {
+            return stored(snprintf(str, size, "%zu:%zu:%zu:%zu%s",
+                                   (size_t) t->nb[0], (size_t) t->nb[1], (size_t) t->nb[2], (size_t) t->nb[3], c), size);
+        }
+    }
+
+    // Strides of every input and of the destination.
+    void format_op_strides(char * str, const htp_opnode & node, size_t size = BUF_WIDE) {
+        format_op(str, size, node, [this](char * p, size_t n, const ggml_tensor * t) {
+            return format_tensor_strides(p, t, n);
+        });
+    }
+
+    // Type names of every input and of the destination.
+    void format_op_types(char * str, const htp_opnode & node, size_t size = BUF_TYPES) {
+        format_op(str, size, node, [](char * p, size_t n, const ggml_tensor * t) {
+            return stored(snprintf(p, n, "%s", ggml_type_name(t->type)), n);
+        });
+    }
+
+    // Name of the backend buffer t lives in, or "NONE" when t has no buffer.
+    const char * tensor_buff_name(const struct ggml_tensor * t) {
+        if (t->buffer) {
+            return ggml_backend_buffer_name(t->buffer);
+        }
+        return "NONE";
+    }
+
+    // Buffer names of every input and of the destination.
+    void format_op_buffs(char * str, const htp_opnode & node, size_t size = BUF_WIDE) {
+        format_op(str, size, node, [this](char * p, size_t n, const ggml_tensor * t) {
+            return stored(snprintf(p, n, "%s", tensor_buff_name(t)), n);
+        });
+    }
+
+    // Tensor names of every input and of the destination.
+    void format_op_names(char * str, const htp_opnode & node, size_t size = BUF_WIDE) {
+        format_op(str, size, node, [](char * p, size_t n, const ggml_tensor * t) {
+            return stored(snprintf(p, n, "%s", t->name), n);
+        });
+    }
+
+    // Fill all five member buffers for node.
+    void format(const htp_opnode & node) {
+        format_op_dims(dims, node, sizeof(dims));
+        format_op_strides(strides, node, sizeof(strides));
+        format_op_types(types, node, sizeof(types));
+        format_op_buffs(buffs, node, sizeof(buffs));
+        format_op_names(names, node, sizeof(names));
+    }
+
+    // Start from empty strings so the buffers are printable before format() runs.
+    htp_opformat() { strides[0] = dims[0] = types[0] = buffs[0] = names[0] = '\0'; }
+    htp_opformat(const htp_opnode & node) { format(node); }
+};
+
+#endif // HTP_OPNODE_H
diff --git a/ggml/src/ggml-hexagon/htp/CMakeLists.txt b/ggml/src/ggml-hexagon/htp/CMakeLists.txt
index d7927261a85..ff3fc0804e3 100644
--- a/ggml/src/ggml-hexagon/htp/CMakeLists.txt
+++ b/ggml/src/ggml-hexagon/htp/CMakeLists.txt
@@ -58,15 +58,16 @@ list(FIND HTP_HMX_VERSIONS ${DSP_VERSION} _hmx_idx)
 
 if (_hmx_idx GREATER_EQUAL 0)
     target_sources(${HTP_LIB} PRIVATE
-        hmx-queue.c
         hmx-flash-attn-ops.c
         hmx-matmul-ops.c
+        hmx-queue.c
     )
 
     # -mhmx enables HMX instruction set (needed by files that include hmx-utils.h)
     set_source_files_properties(
         hmx-flash-attn-ops.c
         hmx-matmul-ops.c
+        hmx-queue.c
         PROPERTIES COMPILE_OPTIONS "-mhmx"
     )
 
diff --git a/ggml/src/ggml-hexagon/htp/flash-attn-ops.c b/ggml/src/ggml-hexagon/htp/flash-attn-ops.c
index d95df6ac9d5..1bd8c1407de 100644
--- a/ggml/src/ggml-hexagon/htp/flash-attn-ops.c
+++ b/ggml/src/ggml-hexagon/htp/flash-attn-ops.c
@@ -22,6 +22,16 @@
 // Must be multiple of 32
 #define FLASH_ATTN_BLOCK_SIZE (32 * 2)
 
+#if __HVX_ARCH__ >= 79  // f32 add/sub/mul each map to a single Vsf intrinsic
+#define HVX_OP_ADD_F32(a, b) Q6_Vsf_vadd_VsfVsf(a, b)
+#define HVX_OP_SUB_F32(a, b) Q6_Vsf_vsub_VsfVsf(a, b)
+#define HVX_OP_MUL_F32(a, b) Q6_Vsf_vmpy_VsfVsf(a, b)
+#else                   // __HVX_ARCH__ < 79: compute in qf32, then convert the result back to sf
+#define HVX_OP_ADD_F32(a, b) Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(a, b))
+#define HVX_OP_SUB_F32(a, b) Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(a, b))
+#define HVX_OP_MUL_F32(a, b) Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a, b))
+#endif
+
 // This is a bit of a hack because the compiler is strugling to properly inline
 // the default hvx_vec_f32_to_f16 with output into the local array.
 static __attribute__((noinline)) void hvx_vec_f32_to_f16_a(void *ptr, HVX_Vector v0, HVX_Vector v1)
@@ -54,8 +64,8 @@ static inline void hvx_dot_f16_f16_aa(float * restrict r, const void * restrict
         rsum_p = hvx_vec_mpyacc_f32_f16(rsum_p, x_hf, y_hf);
     }
 
-    HVX_Vector rsum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(rsum_p), Q6_V_hi_W(rsum_p)));
-    rsum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(hvx_vec_splat_f32(s), hvx_vec_reduce_sum_f32(rsum)));
+    HVX_Vector rsum = HVX_OP_ADD_F32(Q6_V_lo_W(rsum_p), Q6_V_hi_W(rsum_p));
+    rsum = HVX_OP_MUL_F32(hvx_vec_splat_f32(s), hvx_vec_reduce_sum_f32(rsum));
     hvx_vec_store_u(r, 4, rsum);
 }
 
@@ -105,10 +115,10 @@ static inline HVX_Vector hvx_dot_f16_f16_aa_rx4(const void * restrict y,
         rsum3_p = hvx_vec_mpyacc_f32_f16(rsum3_p, x3_hf, y_hf);
     }
 
-    HVX_Vector rsum0 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(rsum0_p), Q6_V_hi_W(rsum0_p)));
-    HVX_Vector rsum1 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(rsum1_p), Q6_V_hi_W(rsum1_p)));
-    HVX_Vector rsum2 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(rsum2_p), Q6_V_hi_W(rsum2_p)));
-    HVX_Vector rsum3 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(rsum3_p), Q6_V_hi_W(rsum3_p)));
+    HVX_Vector rsum0 = HVX_OP_ADD_F32(Q6_V_lo_W(rsum0_p), Q6_V_hi_W(rsum0_p));
+    HVX_Vector rsum1 = HVX_OP_ADD_F32(Q6_V_lo_W(rsum1_p), Q6_V_hi_W(rsum1_p));
+    HVX_Vector rsum2 = HVX_OP_ADD_F32(Q6_V_lo_W(rsum2_p), Q6_V_hi_W(rsum2_p));
+    HVX_Vector rsum3 = HVX_OP_ADD_F32(Q6_V_lo_W(rsum3_p), Q6_V_hi_W(rsum3_p));
 
     HVX_Vector_x4 rsum0123 = { .v = { rsum0, rsum1, rsum2, rsum3 } };
     return hvx_vec_reduce_sum_f32x4(rsum0123);
@@ -123,7 +133,7 @@ static inline HVX_Vector hvx_dot_f16_f16_aa_rx32(const void * restrict y,
     const size_t nvec = n / VLEN_FP16; // num full fp16 hvx vectors
     const size_t nloe = n % VLEN_FP16; // leftover elements
 
-    HVX_Vector   sums;  // initialize at j = 0
+    HVX_Vector   sums = Q6_V_vzero();
     const size_t stride_x_4 = stride_x * 4;
     for (uint32_t j = 0; j < VLEN_FP32; j += 4) {
         HVX_Vector     sums_x4 = hvx_dot_f16_f16_aa_rx4(y, x, stride_x, nvec, nloe);
@@ -132,8 +142,7 @@ static inline HVX_Vector hvx_dot_f16_f16_aa_rx32(const void * restrict y,
         x += stride_x_4;
     }
 
-    sums = Q6_Vqf32_vmpy_VsfVsf(hvx_vec_splat_f32(s), sums);
-    return Q6_Vsf_equals_Vqf32(sums);
+    return HVX_OP_MUL_F32(hvx_vec_splat_f32(s), sums);
 }
 
 // MAD: y (F32) += x (F16) * s (F16)
@@ -268,11 +277,10 @@ static inline void hvx_scale_vec_f32_aa(uint8_t * restrict dst, const uint8_t *
     uint32_t i = 0;
     #pragma unroll(4)
     for (; i < nvec; ++i) {
-        vdst[i] = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs));
+        vdst[i] = HVX_OP_MUL_F32(vsrc[i], vs);
     }
     if (nloe) {
-        HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs);
-        hvx_vec_store_a(&vdst[i], nloe * sizeof(float), Q6_Vsf_equals_Vqf32(v));
+        hvx_vec_store_a(&vdst[i], nloe * sizeof(float), HVX_OP_MUL_F32(vsrc[i], vs));
     }
 }
 
@@ -438,25 +446,44 @@ static void flash_attn_ext_f16_thread(unsigned int nth, unsigned int ith, void *
             // Process in sub-blocks of 32 (VLEN_FP32)
             HVX_Vector sb_scores[FLASH_ATTN_BLOCK_SIZE / VLEN_FP32];
             HVX_Vector v_max = hvx_vec_splat_f32(-INFINITY);
-            for (uint32_t iv = 0; ic + VLEN_FP32 <= current_block_size; ic += VLEN_FP32, ++iv) {
+            for (uint32_t iv = 0; ic < current_block_size; ic += VLEN_FP32, ++iv) {
                 // 1. Compute scores
                 HVX_Vector scores = hvx_dot_f16_f16_aa_rx32(q_ptr_vtcm, k_base + ic * factx->size_k_row_padded, factx->size_k_row_padded, DK, factx->scale);
 
                 // 2. Softcap
                 if (factx->logit_softcap != 0.0f) {
                     scores = hvx_vec_tanh_f32(scores);
-                    scores = Q6_Vqf32_vmpy_VsfVsf(scores, logit_cap);
-                    scores = Q6_Vsf_equals_Vqf32(scores);
+                    scores = HVX_OP_MUL_F32(scores, logit_cap);
                 }
 
                 // 3. Mask
                 if (mask) {
                     const __fp16 * mp = m_base + ic;
                     HVX_Vector m_vals_f16 = *(const HVX_UVector *) mp;
-                    HVX_VectorPair m_vals_f32_pair = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(m_vals_f16), slope_vec);
-                    HVX_Vector add_val = Q6_V_lo_W(m_vals_f32_pair);
-                    scores = Q6_Vqf32_vadd_Vqf32Vsf(add_val, scores);
-                    scores = Q6_Vsf_equals_Vqf32(scores);
+
+                    // Multiplying -INFINITY (0xFC00) by a slope in VhfVhf instructions can incorrectly produce NaN on v79.
+                    // Clamp -INFINITY to the max negative fp16 finite value (-65504.0f).
+                    HVX_Vector vinf = Q6_Vh_vsplat_R(0xFC00);
+                    HVX_Vector vmin = Q6_Vh_vsplat_R(0xFBFF);
+                    HVX_VectorPred is_inf = Q6_Q_vcmp_eq_VhVh(m_vals_f16, vinf);
+                    m_vals_f16 = Q6_V_vmux_QVV(is_inf, vmin, m_vals_f16);
+
+                    #if __HVX_ARCH__ >= 79
+                        HVX_VectorPair m_vals_f32_pair = Q6_Wsf_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(m_vals_f16), slope_vec);
+                        HVX_Vector add_val = Q6_V_lo_W(m_vals_f32_pair);
+                        scores = Q6_Vsf_vadd_VsfVsf(add_val, scores);
+                    #else
+                        HVX_VectorPair m_vals_f32_pair = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(m_vals_f16), slope_vec);
+                        HVX_Vector add_val = Q6_V_lo_W(m_vals_f32_pair);
+                        scores = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(add_val, scores));
+                    #endif
+                }
+
+                // Mask out invalid lanes for leftover handling
+                uint32_t valid_lanes = current_block_size - ic;
+                if (valid_lanes < VLEN_FP32) {
+                    HVX_VectorPred valid_pred = Q6_Q_vsetq_R(valid_lanes * 4); // 4 bytes per fp32 lane
+                    scores = Q6_V_vmux_QVV(valid_pred, scores, hvx_vec_splat_f32(-INFINITY));
                 }
 
                 sb_scores[iv] = scores;
@@ -466,78 +493,55 @@ static void flash_attn_ext_f16_thread(unsigned int nth, unsigned int ith, void *
             {
                 // 4. Online Softmax Update
                 HVX_Vector M_new_vec = Q6_Vsf_vmax_VsfVsf(v_max, M_vec);
-                HVX_Vector diff_vec  = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(M_vec, M_new_vec));
+                HVX_Vector diff_vec  = HVX_OP_SUB_F32(M_vec, M_new_vec);
                 HVX_Vector ms_vec    = hvx_vec_exp_f32(diff_vec);
                 M_vec = M_new_vec;
 
                 hvx_scale_vec_f32_aa((uint8_t *) VKQ32, (const uint8_t *) VKQ32, DV, ms_vec);
 
                 HVX_Vector p_sum_vec = hvx_vec_splat_f32(0.0f);
-                for (uint32_t ic2 = 0, iv = 0; ic2 + VLEN_FP32 <= current_block_size; ic2 += VLEN_FP32, ++iv) {
+                for (uint32_t ic2 = 0, iv = 0; ic2 < current_block_size; ic2 += VLEN_FP32, ++iv) {
                     HVX_Vector scores = sb_scores[iv];
-                    HVX_Vector scores_shifted = Q6_Vqf32_vsub_VsfVsf(scores, M_vec);
-                    HVX_Vector P = hvx_vec_exp_f32(Q6_Vsf_equals_Vqf32(scores_shifted));
+                    HVX_Vector scores_shifted = HVX_OP_SUB_F32(scores, M_vec);
+                    HVX_Vector P = hvx_vec_exp_f32(scores_shifted);
 
-                    p_sum_vec = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(p_sum_vec, P));
+                    p_sum_vec = HVX_OP_ADD_F32(p_sum_vec, P);
 
                     // 5. Accumulate V
                     __fp16 __attribute__((aligned(VLEN))) p_arr[VLEN_FP16];
                     hvx_vec_f32_to_f16_a(p_arr, P, hvx_vec_splat_f32(0));
 
+                    float __attribute__((aligned(128))) P_arr[VLEN_FP32];
+                    hvx_vec_store_a(P_arr, 128, P);
+
                     for (uint32_t j = 0; j < VLEN_FP32; j += 2) {
-                        const uint32_t  cur_ic = ic2 + j;
-                        const uint8_t * v_ptr  = v_base + cur_ic * factx->size_v_row_padded;
+                        const uint32_t cur_ic = ic2 + j;
+                        if (cur_ic >= current_block_size) {
+                            break;
+                        }
+
+                        if (cur_ic + 1 == current_block_size) {
+                            // Odd leftover, process single row
+                            if (P_arr[j] != 0.0f) {
+                                const uint8_t * v_ptr = v_base + cur_ic * factx->size_v_row_padded;
+                                hvx_mad_f32_f16_aa(VKQ32, v_ptr, (p_arr + j), DV);
+                            }
+                            break;
+                        }
+
+                        // Avoid NaN * 0.0 = NaN for uninitialized V cache rows.
+                        // Check the f32 values to safely avoid strict aliasing violations.
+                        if (P_arr[j] == 0.0f && P_arr[j + 1] == 0.0f) {
+                            continue;
+                        }
+
+                        const uint8_t * v_ptr = v_base + cur_ic * factx->size_v_row_padded;
                         hvx_mad_f32_f16_aa_rx2(VKQ32, v_ptr, v_ptr + factx->size_v_row_padded, (p_arr + j), (p_arr + j + 1), DV);
                     }
                 }
 
                 p_sum_vec = hvx_vec_reduce_sum_f32(p_sum_vec);
-                S_vec = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(S_vec, ms_vec)), p_sum_vec));
-            }
-
-            if (ic < current_block_size) {
-                // Sync scalars for leftover/next block if needed
-                float M = hvx_vec_get_f32(M_vec);
-                float S = hvx_vec_get_f32(S_vec);
-
-                // Leftover
-                for (; ic < current_block_size; ++ic) {
-                    float s_val;
-                    const uint8_t * k_ptr = k_base + ic * factx->size_k_row_padded;
-                    hvx_dot_f16_f16_aa(&s_val, q_ptr_vtcm, k_ptr, DK, factx->scale);
-                    if (factx->logit_softcap != 0.0f) {
-                        s_val = factx->logit_softcap * tanhf(s_val);
-                    }
-
-                    if (mask) {
-                        const float m_val = m_base[ic];
-                        s_val += slope * m_val;
-                    }
-
-                    const float Mold = M;
-                    __fp16 vs = 1.0f;
-
-                    if (s_val > M) {
-                        M = s_val;
-                        HVX_Vector diff_vec = hvx_vec_splat_f32(Mold - M);
-                        HVX_Vector ms_vec   = hvx_vec_exp_f32(diff_vec);
-                        hvx_scale_vec_f32_aa((uint8_t *) VKQ32, (const uint8_t *) VKQ32, DV, ms_vec);
-
-                        float ms = hvx_vec_get_f32(ms_vec);
-                        S = S * ms + vs;
-                    } else {
-                        HVX_Vector diff_vec = hvx_vec_splat_f32(s_val - M);
-                        vs = hvx_vec_get_f32(hvx_vec_exp_f32(diff_vec));
-                        S += vs;
-                    }
-
-                    const uint8_t * v_ptr = v_base + ic * factx->size_v_row_padded;
-
-                    hvx_mad_f32_f16_aa(VKQ32, v_ptr, &vs, DV);
-                }
-
-                M_vec = hvx_vec_splat_f32(M);
-                S_vec = hvx_vec_splat_f32(S);
+                S_vec = HVX_OP_ADD_F32(HVX_OP_MUL_F32(S_vec, ms_vec), p_sum_vec);
             }
 
             // Issue DMA for next+1 block (if exists)
@@ -599,8 +603,9 @@ static void flash_attn_ext_f16_thread(unsigned int nth, unsigned int ith, void *
         const int i2 = iq2;
         const int i3 = iq3;
 
-        // dst is permuted
-        uint8_t * dst_ptr = (uint8_t *) dst->data + (i3*ne2*ne1 + i2 + i1*ne1) * nb1;
+        // dst is permuted: [DV, n_heads, n_tokens, n_seq]
+        // head stride is nb[1], token stride is nb[2], batch stride is nb[3]
+        uint8_t * dst_ptr = (uint8_t *) dst->data + i2 * dst->nb[1] + i1 * dst->nb[2] + i3 * dst->nb[3];
 
         if (dst->type == HTP_TYPE_F32) {
             hvx_copy_f32_ua(dst_ptr, (uint8_t *) VKQ32, DV);
@@ -623,8 +628,8 @@ int op_flash_attn_ext(struct htp_ops_context * octx) {
     }
 
 #ifdef HTP_HAS_HMX
-    // HMX path: prefill (neq1 >= 32), head_dim multiple of 32, F16 KV
-    if (k->type == HTP_TYPE_F16 && v->type == HTP_TYPE_F16 && k->ne[0] % 32 == 0 && q->ne[1] >= 32) {
+    // HMX path: head_dim multiple of 32, F16 KV
+    if (k->type == HTP_TYPE_F16 && v->type == HTP_TYPE_F16 && k->ne[0] % 32 == 0) {
         int ret = hmx_flash_attn_ext(octx);
         if (ret == HTP_STATUS_OK) {
             return ret;
diff --git a/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c b/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c
index a496f6289ae..f132c08500d 100644
--- a/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c
+++ b/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c
@@ -1248,9 +1248,6 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
     if (DK % 32 != 0 || DV % 32 != 0) {
         return HTP_STATUS_NO_SUPPORT;
     }
-    if (neq1 < 32) {
-        return HTP_STATUS_NO_SUPPORT;
-    }
 
     // GQA factor
     const uint32_t n_kv_heads = k->ne[2];
diff --git a/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c b/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c
index ab5fd73380b..083d125882d 100644
--- a/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c
+++ b/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c
@@ -16,6 +16,7 @@
 #include "ggml-common.h"
 
 #include "hex-dma.h"
+#include "hex-fastdiv.h"
 #include "worker-pool.h"
 
 #include "hvx-utils.h"
@@ -187,45 +188,44 @@ static int hmx_compute_chunks(size_t   vtcm_total,
 // In x4x2, sub-blocks 0..3 use lower nibbles, sub-blocks 4..7 use upper nibbles
 // of the same 32 packed bytes.
 static inline HVX_Vector dequantize_x4x2_q4_0_group_hvx(const uint8_t *packed_32, bool upper_nibbles, const __fp16 *scale, const HVX_Vector vlut_cvt) {
+    (void)vlut_cvt;
     HVX_Vector vq = hvx_vmemu(packed_32);
     const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
+    const HVX_Vector i8 = Q6_Vb_vsplat_R(8);
     HVX_Vector v_scales = hvx_vec_repl_f16(hvx_vmemu(scale));
-    // q4x4x2 stores two int4 values per byte. Keep only the selected nibble.
-    HVX_Vector v_quants =  Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles);
+
+    HVX_Vector v_quants = Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles);
     v_quants = Q6_V_vand_VV(v_quants, mask_h4);
-    // Shuffle before LUT
-    v_quants = Q6_Vb_vshuff_Vb(v_quants);
-    // Use standard vlut16 (not _nomatch) to avoid stale-register NaN.
-    // _nomatch retains the previous destination-register value for colliding
-    // indices, but the C intrinsic doesn't model the implicit read so the
-    // compiler may allocate a register containing garbage/NaN.
-    HVX_VectorPair vp = Q6_Wh_vlut16_VbVhR(v_quants, vlut_cvt, 0);
-    HVX_Vector v_hf = Q6_V_lo_W(vp);
+
+    HVX_Vector v_int8 = Q6_Vb_vsub_VbVb(v_quants, i8);
+    HVX_Vector v0     = Q6_V_lo_W(Q6_Wh_vunpack_Vb(v_int8));
+    HVX_Vector v_hf   = Q6_Vhf_equals_Vh(v0);
 
     return Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hf, v_scales));
 }
 
 // Batch-dequantize 4 contiguous x4x2 Q4_0 groups (4x32 = 128 packed bytes) using
-// full HVX vector width.  One vmemu + one vlut16 replaces 4 separate calls.
+// full HVX vector width.
 // Output: vector_x2 each hold 32 FP16 values in the first 64 bytes.
 static inline HVX_Vector_x2 dequantize_x4x2_q4_0_x4groups_hvx(
             const uint8_t *packed_128, bool upper_nibbles,
             const __fp16 *scales_4, const HVX_Vector vlut_cvt) {
-    // Load all 128 packed bytes (4 contiguous 32-byte groups)
+    (void)vlut_cvt;
     HVX_Vector vq = hvx_vmemu(packed_128);
     const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
+    const HVX_Vector i8 = Q6_Vb_vsplat_R(8);
     HVX_Vector v_quants = Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles);
     v_quants = Q6_V_vand_VV(v_quants, mask_h4);
 
-    // Shuffle before LUT
-    v_quants = Q6_Vb_vshuff_Vb(v_quants);
+    HVX_Vector v_int8 = Q6_Vb_vsub_VbVb(v_quants, i8);
 
-    // Full-width vlut16: 128 byte lookups -> 128 fp16 results in a VectorPair
-    HVX_VectorPair vp = Q6_Wh_vlut16_VbVhR(v_quants, vlut_cvt, 0);
-    HVX_Vector v_lo = Q6_V_lo_W(vp);  // [group0: 32 fp16 | group1: 32 fp16]
-    HVX_Vector v_hi = Q6_V_hi_W(vp);  // [group2: 32 fp16 | group3: 32 fp16]
+    HVX_VectorPair vp_int16 = Q6_Wh_vunpack_Vb(v_int8);
+    HVX_Vector v_lo = Q6_V_lo_W(vp_int16);
+    HVX_Vector v_hi = Q6_V_hi_W(vp_int16);
+
+    v_lo = Q6_Vhf_equals_Vh(v_lo);
+    v_hi = Q6_Vhf_equals_Vh(v_hi);
 
-    // Build per-group scale vectors: first 64 bytes use scale_a, last 64 use scale_b
     HVX_Vector vscale = hvx_vmemu(scales_4);
     HVX_Vector v_sc01 = hvx_vec_repl_2x_f16(vscale);
     HVX_Vector v_sc23 = hvx_vec_repl_2x_f16(Q6_V_vror_VR(vscale, 4));
@@ -233,13 +233,12 @@ static inline HVX_Vector_x2 dequantize_x4x2_q4_0_x4groups_hvx(
     v_lo = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_lo, v_sc01));
     v_hi = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hi, v_sc23));
 
-    // Extract individual groups: scatter uses q_mask64 so only first 64 bytes matter
-    HVX_Vector_x2 r = { v_lo,/* group1 already in [0:63] */
-                        v_hi /* group2 already in [0:63] */ };
+    HVX_Vector_x2 r = { v_lo, v_hi };
     return r;
 }
 
 static inline HVX_Vector dequantize_x4x2_q4_1_group_hvx(const uint8_t *packed_32, bool upper_nibbles, const __fp16 *scale_offset, const HVX_Vector vlut_cvt) {
+    (void)vlut_cvt;
     HVX_Vector vq = hvx_vmemu(packed_32);
     const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
     HVX_Vector v_dm = hvx_vmemu(scale_offset);
@@ -248,9 +247,9 @@ static inline HVX_Vector dequantize_x4x2_q4_1_group_hvx(const uint8_t *packed_32
 
     HVX_Vector v_quants =  Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles);
     v_quants = Q6_V_vand_VV(v_quants, mask_h4);
-    v_quants = Q6_Vb_vshuff_Vb(v_quants);
-    HVX_VectorPair vp = Q6_Wh_vlut16_VbVhR(v_quants, vlut_cvt, 0);
-    HVX_Vector v_hf = Q6_V_lo_W(vp);
+
+    HVX_Vector v0   = Q6_V_lo_W(Q6_Wh_vunpack_Vb(v_quants));
+    HVX_Vector v_hf = Q6_Vhf_equals_Vh(v0);
 
     return Q6_Vhf_equals_Vqf16(Q6_Vqf16_vadd_Vqf16Vhf(Q6_Vqf16_vmpy_VhfVhf(v_hf, v_scales), v_offsets));
 }
@@ -258,16 +257,18 @@ static inline HVX_Vector dequantize_x4x2_q4_1_group_hvx(const uint8_t *packed_32
 static inline HVX_Vector_x2 dequantize_x4x2_q4_1_x4groups_hvx(
             const uint8_t *packed_128, bool upper_nibbles,
             const __fp16 *scales_offsets_4, const HVX_Vector vlut_cvt) {
+    (void)vlut_cvt;
     HVX_Vector vq = hvx_vmemu(packed_128);
     const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
     HVX_Vector v_quants = Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles);
     v_quants = Q6_V_vand_VV(v_quants, mask_h4);
 
-    v_quants = Q6_Vb_vshuff_Vb(v_quants);
+    HVX_VectorPair vp_int16 = Q6_Wh_vunpack_Vb(v_quants);
+    HVX_Vector v_lo = Q6_V_lo_W(vp_int16);
+    HVX_Vector v_hi = Q6_V_hi_W(vp_int16);
 
-    HVX_VectorPair vp = Q6_Wh_vlut16_VbVhR(v_quants, vlut_cvt, 0);
-    HVX_Vector v_lo = Q6_V_lo_W(vp);
-    HVX_Vector v_hi = Q6_V_hi_W(vp);
+    v_lo = Q6_Vhf_equals_Vh(v_lo);
+    v_hi = Q6_Vhf_equals_Vh(v_hi);
 
     HVX_Vector vscale_offset = hvx_vmemu(scales_offsets_4);
     HVX_VectorPair dm_deal = Q6_W_vdeal_VVR(vscale_offset, vscale_offset, -2);
@@ -287,6 +288,45 @@ static inline HVX_Vector_x2 dequantize_x4x2_q4_1_x4groups_hvx(
     return r;
 }
 
+// LUT-based dequantizers for non-linear IQ4_NL format. Group variant: nibbles at packed_32 -> FP16 via vlut_cvt, times the group scale.
+static inline HVX_Vector dequantize_x4x2_iq4_nl_group_hvx(const uint8_t *packed_32, bool upper_nibbles, const __fp16 *scale, const HVX_Vector vlut_cvt) {
+    HVX_Vector vq = hvx_vmemu(packed_32);  // one vector of packed 4-bit quants
+    const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);  // 0x0F in every byte: keeps the low nibble
+    HVX_Vector v_scales = hvx_vec_repl_f16(hvx_vmemu(scale));  // presumably broadcasts the fp16 at `scale` to all lanes -- confirm helper
+    HVX_Vector v_quants =  Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles);  // shift by 4 when upper_nibbles is set, else by 0
+    v_quants = Q6_V_vand_VV(v_quants, mask_h4);  // each byte is now a table index in 0..15
+    v_quants = Q6_Vb_vshuff_Vb(v_quants);  // byte shuffle ahead of the lookup; presumably pre-compensates vlut16 output order -- confirm
+    HVX_VectorPair vp = Q6_Wh_vlut16_VbVhR(v_quants, vlut_cvt, 0);  // index -> 16-bit table entry taken from vlut_cvt
+    HVX_Vector v_hf = Q6_V_lo_W(vp);  // only the low vector of the pair is used here
+    // Scale the looked-up values: multiply into qf16, then convert the product back to fp16.
+    return Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hf, v_scales));
+}
+
+static inline HVX_Vector_x2 dequantize_x4x2_iq4_nl_x4groups_hvx(  // IQ4_NL batch variant: returns both vectors of the lookup, each scaled
+            const uint8_t *packed_128, bool upper_nibbles,  // packed nibbles; upper_nibbles picks the high nibble of each byte
+            const __fp16 *scales_4, const HVX_Vector vlut_cvt) {  // fp16 scales; nibble -> fp16 lookup table
+    HVX_Vector vq = hvx_vmemu(packed_128);
+    const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
+    HVX_Vector v_quants = Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles);  // shift by 4 or 0 to bring the wanted nibble low
+    v_quants = Q6_V_vand_VV(v_quants, mask_h4);  // isolate it: bytes become indices 0..15
+    // Byte shuffle ahead of the lookup; presumably pre-compensates vlut16 output ordering -- confirm.
+    v_quants = Q6_Vb_vshuff_Vb(v_quants);
+    // Table lookup widens each index byte to a 16-bit entry, giving a vector pair (lo/hi).
+    HVX_VectorPair vp = Q6_Wh_vlut16_VbVhR(v_quants, vlut_cvt, 0);
+    HVX_Vector v_lo = Q6_V_lo_W(vp);
+    HVX_Vector v_hi = Q6_V_hi_W(vp);
+    // v_lo uses scales from the start of scales_4; v_hi uses the same load rotated by 4 bytes (two fp16).
+    HVX_Vector vscale = hvx_vmemu(scales_4);
+    HVX_Vector v_sc01 = hvx_vec_repl_2x_f16(vscale);  // presumably scales 0 and 1 replicated per group -- confirm helper
+    HVX_Vector v_sc23 = hvx_vec_repl_2x_f16(Q6_V_vror_VR(vscale, 4));  // presumably scales 2 and 3
+    // Multiply in qf16 and convert each product back to fp16.
+    v_lo = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_lo, v_sc01));
+    v_hi = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hi, v_sc23));
+
+    HVX_Vector_x2 r = { v_lo, v_hi };  // initializer order: lo first, then hi
+    return r;
+}
+
 // Dequantize one x4x2 Q8_0 group (32 int8 quants) -> 32 FP16 in first 64 bytes.
 static inline HVX_Vector dequantize_x4x2_q8_0_group_hvx(const int8_t *quants_32, const __fp16 *scale) {
     HVX_Vector vq       = hvx_vmemu(quants_32);
@@ -374,122 +414,176 @@ static inline HVX_Vector_x4 dequantize_x4x2_mxfp4_x4groups_hvx(const uint8_t *
     return r;
 }
 
+typedef struct {  // Job description handed to the dequantize worker loops through their void *data argument
+    __fp16                  *dst;               // output base; tile t starts at dst + t * HMX_FP16_TILE_N_ELMS
+    const uint8_t           *src;               // x4x2 input rows; row r starts at src + r * row_stride
+    int                      n_cols;            // row count of src; a second-of-pair row at or beyond it is replaced by zeros
+    int                      k_block;           // K extent; scale data starts at k_block / 2 (Q4 tasks) or k_block (MXFP4, Q8_0) in a row
+    size_t                   row_stride;        // bytes between consecutive rows in src
+    int                      weight_type;       // HTP_TYPE_* id; presumably used to pick the per-type worker (dispatch not shown here)
+    int                      n_tot_tiles;       // total tile count; task ranges are clamped to it
+    int                      n_tiles_per_task;  // tiles handled by one task
+    int                      n_tasks;           // number of tasks the worker loops iterate over
+    int                      n_k_tiles;         // K-tiles per column tile (kt wraps to 0 and ct advances at this value)
+    struct fastdiv_values    n_k_tiles_div;     // fastdiv/fastmodulo constants for dividing by n_k_tiles
+} x4x2_dequantize_state_t;
+
 // Dequantize a tile range from x4x2 weight data (already in VTCM) to tile-major FP16.
 // Input:  vtcm_src has n_cols rows of x4x2 data, each row_stride bytes.
 // Output: vtcm_dst in tile-major FP16 layout.
-static void dequantize_x4x2_weight_to_fp16_tiles_task(
-        __fp16 *restrict vtcm_dst,
-        const uint8_t *restrict vtcm_src,
-        int n_cols, int k_block,
-        size_t row_stride, int weight_type,
-        int start_tile, int end_tile) {
-
-    const int n_k_tiles = (unsigned)k_block / HMX_FP16_TILE_N_COLS;
-    const bool is_q4 = (weight_type == HTP_TYPE_Q4_0 || weight_type == HTP_TYPE_Q4_1 || weight_type == HTP_TYPE_IQ4_NL);
-    const bool is_q4_1 = (weight_type == HTP_TYPE_Q4_1);
-    const int qrow_size = is_q4 ? ((unsigned)k_block / 2) : k_block;
-
-    const HVX_Vector vlut_cvt = (weight_type == HTP_TYPE_IQ4_NL) ? hvx_vmem(iq4_nl_to_fp16_lut) :
-                                (weight_type == HTP_TYPE_MXFP4)  ? hvx_vmem(mxfp4_to_fp16_lut) :
-                                (weight_type == HTP_TYPE_Q4_1)   ? hvx_vmem(q4_1_to_fp16_lut) :
-                                                                   hvx_vmem(q4_0_to_fp16_lut);
 
-    // vscatter setup: write dequantized K-values directly to transposed [K][N] tile positions.
-    // Each int32 element holds a K-row-pair (2 adjacent fp16 values).  word[i] at offset i*128
-    // maps to K-rows 2i and 2i+1.  Column offset (n*4) added per row.
-    const HVX_Vector v_scat_base  = hvx_vmem(hmx_transpose_scatter_offsets);
-    const HVX_Vector v_scat_step  = Q6_V_vsplat_R(4);  // 4 bytes = 1 column step
-    const HVX_VectorPred q_mask64 = Q6_Q_vsetq_R(64);  // first 16 words (64 bytes)
-
-    unsigned ct = (unsigned)start_tile / n_k_tiles;  // column tile index
-    unsigned kt = (unsigned)start_tile % n_k_tiles;  // K tile index
-    for (unsigned t = start_tile; t < end_tile; ) {
-        if (kt >= n_k_tiles) { kt = 0; ct++; }
-
-        // --- Batch-4 fast path for Q4: process 4 contiguous K-tiles with one vlut16 per row ---
-        if (is_q4 && (kt % 4 == 0) && (t + 4 <= end_tile) && ((t + 3) / n_k_tiles == ct)) {
-            unsigned blk_idx      = (kt * 32) / QK_Q4_0x4x2;
-            unsigned sub_blk_base = ((kt * 32) % QK_Q4_0x4x2) / 32;  // 0 or 4
-            bool upper            = (sub_blk_base >= 4);
-            unsigned packed_off   = blk_idx * (QK_Q4_0x4x2 / 2);     // 128 contiguous packed bytes
-            unsigned dblk_size    = is_q4_1 ? 32 : HMX_X4X2_DBLK_SIZE;
-            unsigned scale_step   = is_q4_1 ? 4 : (int)sizeof(__fp16);
-            unsigned scale_off    = qrow_size + blk_idx * dblk_size
-                                  + sub_blk_base * scale_step;
-
-            __fp16 *tile_bases[4];
-            for (unsigned g = 0; g < 4; g++) { tile_bases[g] = vtcm_dst + (t + g) * HMX_FP16_TILE_N_ELMS; }
-
-            HVX_Vector v_off = v_scat_base;
-
-            unsigned row_offset = ct * HMX_FP16_TILE_N_COLS * row_stride;
-            unsigned row1 = ct * HMX_FP16_TILE_N_COLS + 1;
-
-            if (is_q4_1) {
-                for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2, row1 += 2) {
-                    const uint8_t *r0 = vtcm_src + row_offset; row_offset += row_stride;
-                    const uint8_t *r1 = vtcm_src + row_offset; row_offset += row_stride;
-
-                    HVX_Vector_x2 dv0 = dequantize_x4x2_q4_1_x4groups_hvx(r0 + packed_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt);
-                    HVX_Vector_x2 dv1 = dequantize_x4x2_q4_1_x4groups_hvx(r1 + packed_off, upper, (const __fp16 *)(r1 + scale_off), vlut_cvt);
+#define DEFINE_DEQUANTIZE_Q4_TASK(suffix, lut_name, helper_prefix, dblk_size, scale_step)                      \
+static void dequantize_x4x2_weight_to_fp16_tiles_task_##suffix(                                                \
+        const x4x2_dequantize_state_t *state,                                                                  \
+        int start_tile, int end_tile) {                                                                        \
+    /* Dequantizes tiles [start_tile, end_tile) from state->src into FP16 tiles at state->dst. */              \
+    const int n_k_tiles = state->n_k_tiles;                                                                    \
+    const int qrow_size = (unsigned)state->k_block / 2;                                                        \
+    const struct fastdiv_values n_k_tiles_div = state->n_k_tiles_div;                                          \
+    const HVX_Vector vlut_cvt = hvx_vmem(lut_name);                                                            \
+    /* dblk_size/scale_step: byte strides per block and per sub-block in the scale area of a row. */           \
+    const HVX_Vector v_scat_base  = hvx_vmem(hmx_transpose_scatter_offsets);                                   \
+    const HVX_Vector v_scat_step  = Q6_V_vsplat_R(4);                                                          \
+    const HVX_VectorPred q_mask64 = Q6_Q_vsetq_R(64);                                                          \
+                                                                                                               \
+    unsigned ct = fastdiv((unsigned)start_tile, &n_k_tiles_div);                                               \
+    unsigned kt = fastmodulo((unsigned)start_tile, n_k_tiles, &n_k_tiles_div);                                 \
+                                                                                                               \
+    for (unsigned t = start_tile; t < (unsigned)end_tile; ) {                                                  \
+        if (kt >= (unsigned)n_k_tiles) { kt = 0; ct++; }                                                       \
+        /* Batch-4 path: tiles t..t+3 must lie in this range and in column tile ct. */                         \
+        if ((kt % 4 == 0) && (t + 4 <= (unsigned)end_tile) && (fastdiv(t + 3, &n_k_tiles_div) == ct)) {        \
+            unsigned blk_idx      = ((kt * 32) / QK_Q4_0x4x2);                                                 \
+            unsigned sub_blk_base = ((kt * 32) % QK_Q4_0x4x2) / 32;                                            \
+            bool upper            = (sub_blk_base >= 4);                                                       \
+            unsigned packed_off   = blk_idx * (QK_Q4_0x4x2 / 2);                                               \
+            unsigned scale_off    = qrow_size + blk_idx * (dblk_size) + sub_blk_base * (scale_step);           \
+                                                                                                               \
+            __fp16 *tile_bases[4];                                                                             \
+            for (unsigned g = 0; g < 4; g++) {                                                                 \
+                tile_bases[g] = state->dst + (t + g) * HMX_FP16_TILE_N_ELMS;                                   \
+            }                                                                                                  \
+                                                                                                               \
+            HVX_Vector v_off = v_scat_base;                                                                    \
+            unsigned row_offset = ct * HMX_FP16_TILE_N_COLS * state->row_stride;                               \
+                                                                                                               \
+            for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2) {                                                \
+                const uint8_t *r0 = state->src + row_offset; row_offset += state->row_stride;                  \
+                const uint8_t *r1 = state->src + row_offset; row_offset += state->row_stride;                  \
+                                                                                                               \
+                HVX_Vector_x2 dv0 = dequantize_x4x2_##helper_prefix##_x4groups_hvx(                            \
+                    r0 + packed_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt);                       \
+                Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv0.v[0]);         \
+                Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv0.v[1]);         \
+                v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);                                                   \
+                                                                                                               \
+                HVX_Vector_x2 dv1 = dequantize_x4x2_##helper_prefix##_x4groups_hvx(                            \
+                    r1 + packed_off, upper, (const __fp16 *)(r1 + scale_off), vlut_cvt);                       \
+                Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv1.v[0]);         \
+                Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv1.v[1]);         \
+                v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);                                                   \
+            }                                                                                                  \
+            /* Read back each tile: a vmem load on this HW thread drains pending scatter writes. */            \
+            for (int g = 0; g < 4; g++) { (void) *(volatile HVX_Vector *)(tile_bases[g]); }                    \
+            t += 4; kt += 4;                                                                                   \
+            continue;                                                                                          \
+        }                                                                                                      \
+        /* Single-tile fallback; row1 at or beyond n_cols is written as zeros. */                              \
+        __fp16 *tile_base = state->dst + t * HMX_FP16_TILE_N_ELMS;                                             \
+        {                                                                                                      \
+            unsigned blk_idx   = (kt * 32) / QK_Q4_0x4x2;                                                      \
+            unsigned sub_blk   = ((kt * 32) % QK_Q4_0x4x2) / 32;                                               \
+            bool upper         = (sub_blk >= 4);                                                               \
+            unsigned byte_off  = blk_idx * (QK_Q4_0x4x2 / 2) + (upper ? (sub_blk - 4) : sub_blk) * 32;         \
+            unsigned scale_off = qrow_size + blk_idx * (dblk_size) + sub_blk * (scale_step);                   \
+                                                                                                               \
+            HVX_Vector v_off = v_scat_base;                                                                    \
+            unsigned row_offset = ct * HMX_FP16_TILE_N_COLS * state->row_stride;                               \
+            unsigned row1 = ct * HMX_FP16_TILE_N_COLS + 1;                                                     \
+                                                                                                               \
+            for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2, row1 += 2) {                                     \
+                const uint8_t *r0 = state->src + row_offset; row_offset += state->row_stride;                  \
+                const uint8_t *r1 = state->src + row_offset; row_offset += state->row_stride;                  \
+                                                                                                               \
+                HVX_Vector v0 = dequantize_x4x2_##helper_prefix##_group_hvx(                                   \
+                    r0 + byte_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt);                         \
+                HVX_Vector v1 = (row1 < (unsigned)state->n_cols)                                               \
+                    ? dequantize_x4x2_##helper_prefix##_group_hvx(                                             \
+                        r1 + byte_off, upper, (const __fp16 *)(r1 + scale_off), vlut_cvt)                      \
+                    : Q6_V_vzero();                                                                            \
+                                                                                                               \
+                Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v0);            \
+                v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);                                                   \
+                Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v1);            \
+                v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);                                                   \
+            }                                                                                                  \
+            (void) *(volatile HVX_Vector *)(tile_base);                                                        \
+        }                                                                                                      \
+        ++t; ++kt;                                                                                             \
+    }                                                                                                          \
+    /* Drain the HVX scatter buffer so later readers of dst do not see stale data. */                          \
+    if (start_tile < end_tile) {                                                                               \
+        (void) *(volatile HVX_Vector *)(state->dst + (end_tile - 1) * HMX_FP16_TILE_N_ELMS);                   \
+    }                                                                                                          \
+}                                                                                                              \
+/* Worker entry: runs tasks i, i+n, i+2n, ...; each spans n_tiles_per_task tiles, clamped to n_tot_tiles. */   \
+static void dequantize_x4x2_worker_loop_##suffix(unsigned int n, unsigned int i, void *data) {                 \
+    x4x2_dequantize_state_t *state = (x4x2_dequantize_state_t *)data;                                          \
+    for (unsigned int task_id = i; task_id < (unsigned int)state->n_tasks; task_id += n) {                     \
+        int start = task_id * state->n_tiles_per_task;                                                         \
+        int end   = hex_smin(start + state->n_tiles_per_task, state->n_tot_tiles);                             \
+        dequantize_x4x2_weight_to_fp16_tiles_task_##suffix(state, start, end);                                 \
+    }                                                                                                          \
+}
 
-                    Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv0.v[0]);
-                    Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv0.v[1]);
-                    v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
+DEFINE_DEQUANTIZE_Q4_TASK(q4_0,   q4_0_to_fp16_lut,   q4_0, HMX_X4X2_DBLK_SIZE, (int)sizeof(__fp16))  // emits ..._tiles_task_q4_0 and ..._worker_loop_q4_0
+DEFINE_DEQUANTIZE_Q4_TASK(q4_1,   q4_1_to_fp16_lut,   q4_1, 32, 4)  // 32-byte scale blocks, 4-byte step (scale + offset); the q4_1 helpers ignore vlut_cvt
+DEFINE_DEQUANTIZE_Q4_TASK(iq4_nl, iq4_nl_to_fp16_lut, iq4_nl, HMX_X4X2_DBLK_SIZE, (int)sizeof(__fp16))  // same scale strides as q4_0, IQ4_NL lookup table
 
-                    Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv1.v[0]);
-                    Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv1.v[1]);
-                    v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
-                }
-            } else {
-                for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2, row1 += 2) {
-                    const uint8_t *r0 = vtcm_src + row_offset; row_offset += row_stride;
-                    const uint8_t *r1 = vtcm_src + row_offset; row_offset += row_stride;
+static void dequantize_x4x2_weight_to_fp16_tiles_task_mxfp4(
+        const x4x2_dequantize_state_t *state,
+        int start_tile, int end_tile) {
 
-                    HVX_Vector_x2 dv0 = dequantize_x4x2_q4_0_x4groups_hvx(r0 + packed_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt);
-                    HVX_Vector_x2 dv1 = dequantize_x4x2_q4_0_x4groups_hvx(r1 + packed_off, upper, (const __fp16 *)(r1 + scale_off), vlut_cvt);
+    const int n_k_tiles = state->n_k_tiles;
+    const int qrow_size = state->k_block;
+    const struct fastdiv_values n_k_tiles_div = state->n_k_tiles_div;
+    const HVX_Vector vlut_cvt = hvx_vmem(mxfp4_to_fp16_lut);
 
-                    Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv0.v[0]);
-                    Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv0.v[1]);
-                    v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
+    const HVX_Vector v_scat_base  = hvx_vmem(hmx_transpose_scatter_offsets);
+    const HVX_Vector v_scat_step  = Q6_V_vsplat_R(4);
+    const HVX_VectorPred q_mask64 = Q6_Q_vsetq_R(64);
 
-                    Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv1.v[0]);
-                    Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv1.v[1]);
-                    v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
-                }
-            }
+    unsigned ct = fastdiv((unsigned)start_tile, &n_k_tiles_div);
+    unsigned kt = fastmodulo((unsigned)start_tile, n_k_tiles, &n_k_tiles_div);
 
-            for (int g = 0; g < 4; g++) { (void) *(volatile HVX_Vector *)(tile_bases[g]); }
-            t += 4; kt += 4;
-            continue;
-        }
+    for (unsigned t = start_tile; t < (unsigned)end_tile; ) {
+        if (kt >= (unsigned)n_k_tiles) { kt = 0; ct++; }
 
-        // --- Batch-4 fast path for MXFP4: same nibble layout but E8M0 scales ---
-        if (weight_type == HTP_TYPE_MXFP4 && (kt % 4 == 0) && (t + 4 <= end_tile) && ((t + 3) / n_k_tiles == ct)) {
+        // Batch-4 fast path for MXFP4
+        if ((kt % 4 == 0) && (t + 4 <= (unsigned)end_tile) && (fastdiv(t + 3, &n_k_tiles_div) == ct)) {
             int  blk_idx      = (kt * 32) / QK_MXFP4x4x2;
-            int  sub_blk_base = ((kt * 32) % QK_MXFP4x4x2) / 32;                 // 0 or 4
+            int  sub_blk_base = ((kt * 32) % QK_MXFP4x4x2) / 32;
             bool upper        = (sub_blk_base >= 4);
-            int  packed_off   = blk_idx * (QK_MXFP4x4x2 / 2);                    // 128 contiguous packed bytes
-            int  e8m0_blk_off = qrow_size + blk_idx * HMX_X4X2_MXFP4_EBLK_SIZE;  // all 8 E8M0 scales
+            int  packed_off   = blk_idx * (QK_MXFP4x4x2 / 2);
+            int  e8m0_blk_off = qrow_size + blk_idx * HMX_X4X2_MXFP4_EBLK_SIZE;
 
             __fp16 * tile_bases[4];
             for (int g = 0; g < 4; g++) {
-                tile_bases[g] = vtcm_dst + (t + g) * HMX_FP16_TILE_N_ELMS;
+                tile_bases[g] = state->dst + (t + g) * HMX_FP16_TILE_N_ELMS;
             }
 
             HVX_Vector v_off = v_scat_base;
             for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2) {
                 int             row0 = ct * HMX_FP16_TILE_N_COLS + r;
                 int             row1 = row0 + 1;
-                const uint8_t * r0   = vtcm_src + row0 * row_stride;
-                const uint8_t * r1   = vtcm_src + row1 * row_stride;
+                const uint8_t * r0   = state->src + row0 * state->row_stride;
+                const uint8_t * r1   = state->src + row1 * state->row_stride;
 
-                // Batch-convert all 8 E8M0 scales once per row (stays in HVX register)
                 mxfp4_scales_t r0_e8 = mxfp4_convert_scales(r0 + e8m0_blk_off);
 
                 HVX_Vector_x4 dv0, dv1;
                 dv0 = dequantize_x4x2_mxfp4_x4groups_hvx(r0 + packed_off, upper, sub_blk_base, vlut_cvt, r0_e8);
-                if (row1 < n_cols) {
+                if (row1 < state->n_cols) {
                     mxfp4_scales_t r1_e8 = mxfp4_convert_scales(r1 + e8m0_blk_off);
                     dv1 = dequantize_x4x2_mxfp4_x4groups_hvx(r1 + packed_off, upper, sub_blk_base, vlut_cvt, r1_e8);
                 } else {
@@ -510,58 +604,13 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
                 (void) *(volatile HVX_Vector *) (tile_bases[g]);
             }
 
-            t += 4;
+            t += 4; kt += 4;
             continue;
         }
 
-        // --- Single-tile fallback ---
-        __fp16 *tile_base = vtcm_dst + t * HMX_FP16_TILE_N_ELMS;
-
-        if (is_q4) {
-            unsigned blk_idx   = (kt * 32) / QK_Q4_0x4x2;
-            unsigned sub_blk   = ((kt * 32) % QK_Q4_0x4x2) / 32;
-            bool upper         = (sub_blk >= 4);
-            unsigned byte_off  = blk_idx * (QK_Q4_0x4x2 / 2) + (upper ? (sub_blk - 4) : sub_blk) * 32;
-            unsigned dblk_size = is_q4_1 ? 32 : HMX_X4X2_DBLK_SIZE;
-            unsigned scale_step = is_q4_1 ? 4 : (int)sizeof(__fp16);
-            unsigned scale_off = qrow_size + blk_idx * dblk_size + sub_blk * scale_step;
-
-            HVX_Vector v_off = v_scat_base;  // reset to column 0
-            unsigned row_offset = ct * HMX_FP16_TILE_N_COLS * row_stride;
-            unsigned row1 = ct * HMX_FP16_TILE_N_COLS + 1;
-            if (is_q4_1) {
-                for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2, row1 += 2) {
-                    const uint8_t *r0 = vtcm_src + row_offset; row_offset += row_stride;
-                    const uint8_t *r1 = vtcm_src + row_offset; row_offset += row_stride;
-
-                    HVX_Vector v0 = dequantize_x4x2_q4_1_group_hvx(r0 + byte_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt);
-                    HVX_Vector v1 = (row1 < n_cols)
-                        ? dequantize_x4x2_q4_1_group_hvx(r1 + byte_off, upper, (const __fp16 *)(r1 + scale_off), vlut_cvt)
-                        : Q6_V_vzero();
-
-                    Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v0);
-                    v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
-                    Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v1);
-                    v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
-                }
-            } else {
-                for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2, row1 += 2) {
-                    const uint8_t *r0 = vtcm_src + row_offset; row_offset += row_stride;
-                    const uint8_t *r1 = vtcm_src + row_offset; row_offset += row_stride;
-
-                    HVX_Vector v0 = dequantize_x4x2_q4_0_group_hvx(r0 + byte_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt);
-                    HVX_Vector v1 = (row1 < n_cols)
-                        ? dequantize_x4x2_q4_0_group_hvx(r1 + byte_off, upper, (const __fp16 *)(r1 + scale_off), vlut_cvt)
-                        : Q6_V_vzero();
-
-                    Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v0);
-                    v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
-                    Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v1);
-                    v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
-                }
-            }
-            (void) *(volatile HVX_Vector *)(tile_base);
-        } else if (weight_type == HTP_TYPE_MXFP4) {
+        // Single-tile fallback
+        __fp16 *tile_base = state->dst + t * HMX_FP16_TILE_N_ELMS;
+        {
             int  blk_idx      = (kt * 32) / QK_MXFP4x4x2;
             int  sub_blk      = ((kt * 32) % QK_MXFP4x4x2) / 32;
             bool upper        = (sub_blk >= 4);
@@ -573,15 +622,14 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
                 int row0 = ct * HMX_FP16_TILE_N_COLS + r;
                 int row1 = row0 + 1;
 
-                const uint8_t * r0 = vtcm_src + row0 * row_stride;
-                const uint8_t * r1 = vtcm_src + row1 * row_stride;
+                const uint8_t * r0 = state->src + row0 * state->row_stride;
+                const uint8_t * r1 = state->src + row1 * state->row_stride;
 
-                // Batch-convert all 8 E8M0 scales once per row (stays in HVX register)
                 mxfp4_scales_t r0_e8 = mxfp4_convert_scales(r0 + e8m0_blk_off);
 
                 HVX_Vector v0 = dequantize_x4x2_mxfp4_group_hvx(r0 + byte_off, upper, sub_blk, vlut_cvt, r0_e8);
                 HVX_Vector v1;
-                if (row1 < n_cols) {
+                if (row1 < state->n_cols) {
                     mxfp4_scales_t r1_e8 = mxfp4_convert_scales(r1 + e8m0_blk_off);
                     v1 = dequantize_x4x2_mxfp4_group_hvx(r1 + byte_off, upper, sub_blk, vlut_cvt, r1_e8);
                 } else {
@@ -594,23 +642,59 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
                 v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
             }
             (void) *(volatile HVX_Vector *) (tile_base);
-        } else {
-            // Q8_0
+        }
+        ++t; ++kt;
+    }
+
+    if (start_tile < end_tile) {
+        (void) *(volatile HVX_Vector *)(state->dst + (end_tile - 1) * HMX_FP16_TILE_N_ELMS);
+    }
+}
+
+static void dequantize_x4x2_worker_loop_mxfp4(unsigned int n, unsigned int i, void *data) {  // MXFP4 worker entry; data is an x4x2_dequantize_state_t
+    x4x2_dequantize_state_t *state = (x4x2_dequantize_state_t *)data;
+    for (unsigned int task_id = i; task_id < (unsigned int)state->n_tasks; task_id += n) {  // visits tasks i, i+n, i+2n, ...
+        int start = task_id * state->n_tiles_per_task;  // first tile of this task
+        int end   = hex_smin(start + state->n_tiles_per_task, state->n_tot_tiles);  // exclusive end; hex_smin presumably a signed min, so the last task is clamped
+        dequantize_x4x2_weight_to_fp16_tiles_task_mxfp4(state, start, end);  // processes tiles [start, end)
+    }
+}
+
+static void dequantize_x4x2_weight_to_fp16_tiles_task_q8_0(
+        const x4x2_dequantize_state_t *state,
+        int start_tile, int end_tile) {
+
+    const int n_k_tiles = state->n_k_tiles;
+    const int qrow_size = state->k_block;
+    const struct fastdiv_values n_k_tiles_div = state->n_k_tiles_div;
+
+    const HVX_Vector v_scat_base  = hvx_vmem(hmx_transpose_scatter_offsets);
+    const HVX_Vector v_scat_step  = Q6_V_vsplat_R(4);
+    const HVX_VectorPred q_mask64 = Q6_Q_vsetq_R(64);
+
+    unsigned ct = fastdiv((unsigned)start_tile, &n_k_tiles_div);
+    unsigned kt = fastmodulo((unsigned)start_tile, n_k_tiles, &n_k_tiles_div);
+
+    for (unsigned t = start_tile; t < (unsigned)end_tile; ) {
+        if (kt >= (unsigned)n_k_tiles) { kt = 0; ct++; }
+
+        __fp16 *tile_base = state->dst + t * HMX_FP16_TILE_N_ELMS;
+        {
             int blk_idx  = (kt * 32) / QK_Q8_0x4x2;
             int sub_blk  = ((kt * 32) % QK_Q8_0x4x2) / 32;
             int byte_off  = blk_idx * QK_Q8_0x4x2 + sub_blk * 32;
             int scale_off = qrow_size + blk_idx * HMX_X4X2_DBLK_SIZE + sub_blk * (int)sizeof(__fp16);
 
-            HVX_Vector v_off = v_scat_base;  // reset to column 0
+            HVX_Vector v_off = v_scat_base;
             for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2) {
                 int row0 = ct * HMX_FP16_TILE_N_COLS + r;
                 int row1 = row0 + 1;
 
-                const uint8_t *r0 = vtcm_src + row0 * row_stride;
-                const uint8_t *r1 = vtcm_src + row1 * row_stride;
+                const uint8_t *r0 = state->src + row0 * state->row_stride;
+                const uint8_t *r1 = state->src + row1 * state->row_stride;
 
                 HVX_Vector v0 = dequantize_x4x2_q8_0_group_hvx((const int8_t *)(r0 + byte_off), (const __fp16 *)(r0 + scale_off));
-                HVX_Vector v1 = (row1 < n_cols) ? dequantize_x4x2_q8_0_group_hvx((const int8_t *)(r1 + byte_off), (const __fp16 *)(r1 + scale_off)) : Q6_V_vzero();
+                HVX_Vector v1 = (row1 < state->n_cols) ? dequantize_x4x2_q8_0_group_hvx((const int8_t *)(r1 + byte_off), (const __fp16 *)(r1 + scale_off)) : Q6_V_vzero();
 
                 Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v0);
                 v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
@@ -622,50 +706,31 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
         ++t; ++kt;
     }
 
-    // Drain HVX scatter write buffer: a vmem load on the same HW thread retires
-    // all pending scatter entries to VTCM.  Without this, the main thread's HMX
-    // reads may see stale data because atomic_fetch_sub (release) only orders
-    // regular stores, not the HVX scatter buffer.
     if (start_tile < end_tile) {
-        (void) *(volatile HVX_Vector *)(vtcm_dst + (end_tile - 1) * HMX_FP16_TILE_N_ELMS);
+        (void) *(volatile HVX_Vector *)(state->dst + (end_tile - 1) * HMX_FP16_TILE_N_ELMS);  // vmem load on this HW thread retires pending HVX scatter writes so later HMX reads do not see stale data
     }
 }
 
-typedef struct {
-    __fp16        *dst;
-    const uint8_t *src;
-    int            n_cols;
-    int            k_block;
-    size_t         row_stride;
-    int            weight_type;
-    int            n_tot_tiles;
-    int            n_tiles_per_task;
-    int            n_tasks;
-} x4x2_dequantize_state_t;
-
-static void dequantize_x4x2_worker_loop(unsigned int n, unsigned int i, void *data) {
+static void dequantize_x4x2_worker_loop_q8_0(unsigned int n, unsigned int i, void *data) {  // handles task ids i, i+n, i+2n, ...
     x4x2_dequantize_state_t *state = (x4x2_dequantize_state_t *)data;
-
     for (unsigned int task_id = i; task_id < (unsigned int)state->n_tasks; task_id += n) {
         int start = task_id * state->n_tiles_per_task;
         int end   = hex_smin(start + state->n_tiles_per_task, state->n_tot_tiles);
-
-        dequantize_x4x2_weight_to_fp16_tiles_task(
-            state->dst, state->src, state->n_cols, state->k_block,
-            state->row_stride, state->weight_type, start, end);
+        dequantize_x4x2_weight_to_fp16_tiles_task_q8_0(state, start, end);  // Q8_0 dequant of tiles [start, end)
     }
 }
 
 static void dequantize_x4x2_weight_chunk_to_fp16_tiles(
         struct htp_context *ctx, __fp16 *vtcm_dst,
         const void *vtcm_src, int n_cols, int k_block,
-        size_t row_stride, int weight_type) {
+        size_t row_stride, int weight_type,
+        int n_k_tiles, struct fastdiv_values n_k_tiles_div,
+        worker_callback_t dequant_worker_fn) {
 
     assert(n_cols  % HMX_FP16_TILE_N_COLS == 0);
     assert(k_block % HMX_FP16_TILE_N_COLS == 0);
 
     size_t n_col_tiles = n_cols / HMX_FP16_TILE_N_COLS;
-    size_t n_k_tiles   = k_block / HMX_FP16_TILE_N_COLS;
     size_t n_tot_tiles = n_col_tiles * n_k_tiles;
 
     size_t n_tiles_per_task = hmx_ceil_div(n_tot_tiles, ctx->n_threads);
@@ -680,8 +745,10 @@ static void dequantize_x4x2_weight_chunk_to_fp16_tiles(
     state.k_block          = k_block;
     state.row_stride       = row_stride;
     state.weight_type      = weight_type;
+    state.n_k_tiles        = n_k_tiles;
+    state.n_k_tiles_div    = n_k_tiles_div;
 
-    worker_pool_run_func(ctx->worker_pool, dequantize_x4x2_worker_loop, &state, ctx->n_threads);
+    worker_pool_run_func(ctx->worker_pool, dequant_worker_fn, &state, ctx->n_threads);
 }
 
 // --- End x4x2 dequantizers ---
@@ -978,6 +1045,20 @@ int hmx_matmul_q_f32(struct htp_context *ctx, float *restrict dst, const float *
         return -1;
     }
 
+    worker_callback_t dequant_worker_fn = NULL;
+    switch (weight_type) {
+        case HTP_TYPE_Q4_0:   dequant_worker_fn = dequantize_x4x2_worker_loop_q4_0; break;
+        case HTP_TYPE_IQ4_NL: dequant_worker_fn = dequantize_x4x2_worker_loop_iq4_nl; break;
+        case HTP_TYPE_Q4_1:   dequant_worker_fn = dequantize_x4x2_worker_loop_q4_1; break;
+        case HTP_TYPE_MXFP4:  dequant_worker_fn = dequantize_x4x2_worker_loop_mxfp4; break;
+        case HTP_TYPE_Q8_0:   dequant_worker_fn = dequantize_x4x2_worker_loop_q8_0; break;
+        default:
+            return -1;
+    }
+
+    const int n_k_tiles = k / HMX_FP16_TILE_N_COLS;
+    const struct fastdiv_values n_k_tiles_div = init_fastdiv_values(n_k_tiles);
+
     // --- Dynamic VTCM layout ---
     const size_t vec_dot_size = k * sizeof(__fp16);
     const size_t vtcm_budget  = ctx->vtcm_size;
@@ -1070,7 +1151,7 @@ int hmx_matmul_q_f32(struct htp_context *ctx, float *restrict dst, const float *
         {
             // B0: wait for DMA, dequant weight chunk 0
             dma_queue_pop(ctx->dma[0]);
-            dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[0], vtcm_qweight, n_cols_A0, k, row_stride, weight_type);
+            dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[0], vtcm_qweight, n_cols_A0, k, row_stride, weight_type, n_k_tiles, n_k_tiles_div, dequant_worker_fn);
 
             // A1: issue DMA for weight chunk 1
             const size_t n_cols_A1 = hex_smin(n - 1 * n_chunk_n_cols, n_chunk_n_cols);
@@ -1089,7 +1170,7 @@ int hmx_matmul_q_f32(struct htp_context *ctx, float *restrict dst, const float *
             // B1: DMA pop + dequant (runs in parallel with C0 on HMX worker)
             if (1 < n_chunk_cnt) {
                 dma_queue_pop(ctx->dma[0]);
-                dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[1], vtcm_qweight, n_cols_A1, k, row_stride, weight_type);
+                dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[1], vtcm_qweight, n_cols_A1, k, row_stride, weight_type, n_k_tiles, n_k_tiles_div, dequant_worker_fn);
             }
         }
 
@@ -1131,7 +1212,7 @@ int hmx_matmul_q_f32(struct htp_context *ctx, float *restrict dst, const float *
             // B_{i+2}: DMA pop + dequant (multi-thread HVX, parallel with C_{i+1})
             if (i + 2 < n_chunk_cnt) {
                 dma_queue_pop(ctx->dma[0]);
-                dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[(i + 2) % 2], vtcm_qweight, n_cols_p2, k, row_stride, weight_type);
+                dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[(i + 2) % 2], vtcm_qweight, n_cols_p2, k, row_stride, weight_type, n_k_tiles, n_k_tiles_div, dequant_worker_fn);
             }
         }
     }
diff --git a/ggml/src/ggml-hexagon/htp/htp-ops.h b/ggml/src/ggml-hexagon/htp/htp-ops.h
index aadc77235ba..fa85bf4ca0c 100644
--- a/ggml/src/ggml-hexagon/htp/htp-ops.h
+++ b/ggml/src/ggml-hexagon/htp/htp-ops.h
@@ -58,6 +58,7 @@ enum htp_op_code {
     HTP_OP_MUL_MAT,
     HTP_OP_MUL_MAT_ID,
     HTP_OP_RMS_NORM,
+    HTP_OP_RMS_NORM_MUL,
     HTP_OP_UNARY_SILU,
     HTP_OP_UNARY_GELU,
     HTP_OP_UNARY_SIGMOID,
diff --git a/ggml/src/ggml-hexagon/htp/main.c b/ggml/src/ggml-hexagon/htp/main.c
index 7dd90ac7d7f..623008be4e2 100644
--- a/ggml/src/ggml-hexagon/htp/main.c
+++ b/ggml/src/ggml-hexagon/htp/main.c
@@ -537,6 +537,7 @@ static int execute_op(struct htp_ops_context * octx) {
 
         case HTP_OP_NORM:
         case HTP_OP_RMS_NORM:
+        case HTP_OP_RMS_NORM_MUL:
         case HTP_OP_SCALE:
         case HTP_OP_SQR:
         case HTP_OP_SQRT:
diff --git a/ggml/src/ggml-hexagon/htp/unary-ops.c b/ggml/src/ggml-hexagon/htp/unary-ops.c
index 7d0431d8ba8..770a6673211 100644
--- a/ggml/src/ggml-hexagon/htp/unary-ops.c
+++ b/ggml/src/ggml-hexagon/htp/unary-ops.c
@@ -23,21 +23,26 @@ struct htp_unary_context {
 
     // Precomputed values
     const uint8_t *           data_src0;
+    const uint8_t *           data_src1;            // weight/scale tensor for RMS_NORM_MUL
     uint8_t *                 data_dst;
 
     size_t                    src0_data_row_size;   // actual data bytes per row
+    size_t                    src1_data_row_size;   // actual data bytes per src1 row (0 unless RMS_NORM_MUL)
     size_t                    dst_data_row_size;    // actual data bytes per row
 
     size_t                    src0_row_size_aligned;
+    size_t                    src1_row_size_aligned; // src1_data_row_size rounded up to VLEN
     size_t                    dst_row_size_aligned;
 
     size_t                    src0_spad_half_size;
+    size_t                    src1_spad_half_size;  // bytes per src1 scratchpad slot; the whole per-thread buffer when broadcast_weight
     size_t                    dst_spad_half_size;
 
     uint32_t                  block;
     uint32_t                  src0_nrows;
     uint32_t                  src0_nrows_per_thread;
     uint32_t                  nc;
+    bool                      broadcast_weight;     // src1 has a single row (ne[1]*ne[2]*ne[3] == 1) reused for every src0 row
 };
 
 // Convert flat row index to DDR byte offset using the tensor's actual strides.
@@ -158,6 +163,71 @@ static void hvx_fast_rms_norm_f32(const uint8_t * restrict src,
     }
 }
 
+static void hvx_fast_rms_norm_mul_f32(const uint8_t * restrict src,
+                                      const uint8_t * restrict weight,
+                                      uint8_t * restrict dst,
+                                      const int num_elems,
+                                      float     epsilon) {
+    // Fused RMS normalization and element-wise weight multiply for one f32 row:
+    //   dst[i] = src[i] * rsqrt(mean(src[0..num_elems)^2) + epsilon) * weight[i]
+    //   num_elems : number of f32 elements in the row
+    //   epsilon   : added to the mean of squares before the reciprocal square root
+    // src, weight and dst are walked as arrays of whole HVX vectors.
+
+    const HVX_Vector * restrict in_vecs  = (const HVX_Vector *) src;
+    const HVX_Vector * restrict w_vecs   = (const HVX_Vector *) weight;
+    HVX_Vector * restrict       out_vecs = (HVX_Vector *) dst;
+
+    const int n_full     = num_elems / VLEN_FP32;  // complete vectors in the row
+    const int n_tail     = num_elems % VLEN_FP32;  // elements left over for a partial vector
+    const int tail_bytes = n_tail * 4;             // leftover size in bytes (4 bytes per f32)
+
+    // Pass 1: accumulate the squares of src, lane by lane, in qf32
+    HVX_Vector acc_sq = Q6_V_vsplat_R(0x00000000);
+
+    #pragma unroll(4)
+    for (int v = 0; v < n_full; v++) {
+        const HVX_Vector x  = in_vecs[v];
+        const HVX_Vector sq = Q6_Vqf32_vmpy_VsfVsf(x, x);
+        acc_sq = Q6_Vqf32_vadd_Vqf32Vqf32(acc_sq, sq);
+    }
+
+    if (n_tail > 0) {
+        // Zero the lanes past the end of the row so they add nothing to the sum
+        const HVX_Vector x  = Q6_V_vand_QV(Q6_Q_vsetq_R(tail_bytes), in_vecs[n_full]);
+        const HVX_Vector sq = Q6_Vqf32_vmpy_VsfVsf(x, x);
+        acc_sq = Q6_Vqf32_vadd_Vqf32Vqf32(acc_sq, sq);
+    }
+
+    // Reduce the per-lane partial sums
+    const HVX_Vector sum_sq = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(acc_sq));
+
+    // mean = sum * (1 / num_elems), then add epsilon
+    const HVX_Vector inv_n    = hvx_vec_inverse_f32(hvx_vec_splat_f32((float) num_elems));
+    const HVX_Vector mean_sq  = Q6_Vqf32_vmpy_VsfVsf(sum_sq, inv_n);
+    const HVX_Vector mean_eps = Q6_Vqf32_vadd_Vqf32Vsf(mean_sq, hvx_vec_splat_f32(epsilon));
+
+    // Normalization factor: rsqrt(mean + epsilon)
+    const HVX_Vector inv_rms = hvx_vec_rsqrt_f32(Q6_Vsf_equals_Vqf32(mean_eps));
+
+    // Pass 2: normalize each vector, then multiply by the matching weight vector
+    #pragma unroll(4)
+    for (int v = 0; v < n_full; v++) {
+        const HVX_Vector normed = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(in_vecs[v], inv_rms));
+        out_vecs[v] = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(normed, w_vecs[v]));
+    }
+
+    if (n_tail > 0) {
+        // Tail: same math on the masked partial vector
+        const HVX_Vector x      = Q6_V_vand_QV(Q6_Q_vsetq_R(tail_bytes), in_vecs[n_full]);
+        const HVX_Vector normed = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(x, inv_rms));
+        const HVX_Vector scaled = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(normed, w_vecs[n_full]));
+
+        // Masked store of tail_bytes only, so memory beyond the tensor is not overwritten
+        hvx_vec_store_a(&out_vecs[n_full], tail_bytes, scaled);
+    }
+}
+
 static void hvx_fast_norm_f32(const uint8_t * restrict src,
                                   uint8_t * restrict dst,
                                   uint8_t * restrict pad,
@@ -269,6 +339,27 @@ static void rms_norm_f32(const float * restrict src,
     }
 }
 
+static void rms_norm_mul_f32(const float * restrict src,
+                             const float * restrict weight,
+                             float * restrict dst,
+                             const uint32_t num_rows,
+                             const uint32_t row_elems,
+                             const size_t   row_size,
+                             const size_t   weight_row_size,
+                             int32_t *      op_params,
+                             bool           broadcast_weight) {
+    // Run the fused RMS-norm * weight kernel on num_rows rows; row_size and weight_row_size
+    // are byte strides. With broadcast_weight set, every row reuses the first weight row.
+    float eps = 0.f;
+    memcpy(&eps, op_params, sizeof(float));  // epsilon is read as raw float bytes from the start of op_params
+    const size_t w_stride = broadcast_weight ? 0 : weight_row_size;
+    for (uint32_t row = 0; row < num_rows; row++) {
+        const size_t row_off = row * row_size;  // src and dst rows use the same byte stride
+        hvx_fast_rms_norm_mul_f32((const uint8_t *) src + row_off, (const uint8_t *) weight + row * w_stride,
+                                  (uint8_t *) dst + row_off, row_elems, eps);
+    }
+}
+
 static void norm_f32(const float * restrict src,
                          float * restrict dst,
                          uint8_t * restrict spad,
@@ -598,12 +689,15 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void *
     t1 = HAP_perf_get_qtimer_count();
 
     const uint8_t * restrict data_src = uctx->data_src0;
+    const uint8_t * restrict data_src1 = uctx->data_src1;
     uint8_t * restrict       data_dst = uctx->data_dst;
 
     uint8_t * src0_spad_data = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread);
+    uint8_t * src1_spad_data = octx->src1_spad.data + (ith * octx->src1_spad.size_per_thread);
     uint8_t * dst_spad_data  = octx->dst_spad.data  + (ith * octx->dst_spad.size_per_thread);
 
     size_t src0_spad_half_size = uctx->src0_spad_half_size;
+    size_t src1_spad_half_size = uctx->src1_spad_half_size;
     size_t dst_spad_half_size  = uctx->dst_spad_half_size;
 
     // Non-contiguous tensors have gaps at dim-2/3 boundaries that a single-stride
@@ -624,6 +718,12 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void *
 
     dma_queue * dma_queue = octx->ctx->dma[ith];
 
+    // If weight is broadcasted, load it once per thread at the beginning of execution
+    if (htp_op == HTP_OP_RMS_NORM_MUL && uctx->broadcast_weight) {
+        dma_queue_push(dma_queue, dma_make_ptr(src1_spad_data, data_src1), uctx->src1_row_size_aligned, 0, uctx->src1_data_row_size, 1);
+        dma_queue_flush(dma_queue);
+    }
+
     for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; spad_idx++) {
         const uint32_t block_size = unary_block_size(ir, src0_end_row, BLOCK, src0_contig, dst_contig, ne01, ne1);
 
@@ -636,6 +736,14 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void *
         dma_queue_push(dma_queue,
             dma_make_ptr(src0_spad_data + (spad_idx * src0_spad_half_size), data_src + src0_off),
             src0_row_size_aligned, nb01, src0_data_row_size, block_size);
+
+        if (htp_op == HTP_OP_RMS_NORM_MUL && !uctx->broadcast_weight) {
+            const size_t src1_off = unary_row_offset(ir, ne01, ne02, nb01, nb02, nb03);  // NOTE(review): nb01 is also the stride of the src0 transfer - confirm src1 rows share src0's layout
+            dma_queue_push(dma_queue,
+                dma_make_ptr(src1_spad_data + (spad_idx * src1_spad_half_size), data_src1 + src1_off),
+                uctx->src1_row_size_aligned, nb01, uctx->src1_data_row_size, block_size);
+        }
+
         ir += block_size;
     }
 
@@ -644,6 +752,10 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void *
 
         float * dst_spad  = (float *) dma_queue_pop(dma_queue).src;
         float * src0_spad = (float *) dma_queue_pop(dma_queue).dst;
+        float * src1_spad = NULL;
+        if (htp_op == HTP_OP_RMS_NORM_MUL && !uctx->broadcast_weight) {
+            src1_spad = (float *) dma_queue_pop(dma_queue).dst;
+        }
 
         // Process block in VTCM
         switch (htp_op) {
@@ -653,6 +765,12 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void *
             case HTP_OP_RMS_NORM:
                 rms_norm_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params);
                 break;
+            case HTP_OP_RMS_NORM_MUL:
+                {
+                    const float * w_ptr = uctx->broadcast_weight ? (const float *) src1_spad_data : src1_spad;
+                    rms_norm_mul_f32(src0_spad, w_ptr, dst_spad, block_size, ne0, src0_row_size_aligned, uctx->src1_row_size_aligned, op_params, uctx->broadcast_weight);
+                }
+                break;
             case HTP_OP_SCALE:
                 scale_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params);
                 break;
@@ -700,9 +818,16 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void *
             if (pref_ir < src0_end_row) {
                 const uint32_t pref_block_size = unary_block_size(pref_ir, src0_end_row, BLOCK, src0_contig, dst_contig, ne01, ne1);
                 const size_t src0_pref_off = unary_row_offset(pref_ir, ne01, ne02, nb01, nb02, nb03);
-            dma_queue_push(dma_queue,
-                dma_make_ptr(src0_spad, data_src + src0_pref_off),
-                src0_row_size_aligned, nb01, src0_data_row_size, pref_block_size);
+                dma_queue_push(dma_queue,
+                    dma_make_ptr(src0_spad, data_src + src0_pref_off),
+                    src0_row_size_aligned, nb01, src0_data_row_size, pref_block_size);
+
+                if (htp_op == HTP_OP_RMS_NORM_MUL && !uctx->broadcast_weight) {
+                    const size_t src1_pref_off = unary_row_offset(pref_ir, ne01, ne02, nb01, nb02, nb03);  // NOTE(review): same arguments as src0_pref_off - confirm src1 rows share src0's layout
+                    dma_queue_push(dma_queue,
+                        dma_make_ptr(src1_spad, data_src1 + src1_pref_off),
+                        uctx->src1_row_size_aligned, nb01, uctx->src1_data_row_size, pref_block_size);
+                }
             }
         }
         ir += block_size;
@@ -732,6 +857,9 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) {
         case HTP_OP_RMS_NORM:
             op_type = "rmsnorm-f32";
             break;
+        case HTP_OP_RMS_NORM_MUL:
+            op_type = "rmsnorm-mul-f32";
+            break;
         case HTP_OP_SCALE:
             op_type = "scale-f32";
             break;
@@ -777,12 +905,44 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) {
     const size_t src0_row_size_aligned = hex_round_up(src0_data_row_size, VLEN);
     const size_t dst_row_size_aligned  = hex_round_up(dst_data_row_size,  VLEN);
 
+    size_t src1_data_row_size = 0;
+    size_t src1_row_size_aligned = 0;
+    bool broadcast_weight = false;
+    const struct htp_tensor * src1 = NULL;
+
+    if (octx->op == HTP_OP_RMS_NORM_MUL) {
+        src1 = octx->src[1];
+        src1_data_row_size = src1->ne[0] * sizeof(float);
+        src1_row_size_aligned = hex_round_up(src1_data_row_size, VLEN);
+        broadcast_weight = (src1->ne[1] * src1->ne[2] * src1->ne[3] == 1);
+    }
+
     // VTCM scratchpads for all tensors
     // N rows per thread, padded to HVX vector size
     // Double buffering requires 2x size per buffer
 
-    size_t spad_size_per_row   = 2 * (src0_row_size_aligned + dst_row_size_aligned);
-    size_t vtcm_row_per_thread = (octx->ctx->vtcm_size)/ (n_threads * spad_size_per_row);
+    size_t spad_size_per_row = 0;
+    size_t vtcm_row_per_thread = 0;
+
+    if (octx->op == HTP_OP_RMS_NORM_MUL) {
+        if (broadcast_weight) {
+            size_t available_vtcm = octx->ctx->vtcm_size;
+            size_t src1_spad_total = n_threads * src1_row_size_aligned;
+            if (available_vtcm > src1_spad_total) {
+                available_vtcm -= src1_spad_total;
+            } else {
+                available_vtcm = 0;
+            }
+            spad_size_per_row = 2 * (src0_row_size_aligned + dst_row_size_aligned);
+            vtcm_row_per_thread = available_vtcm / (n_threads * spad_size_per_row);
+        } else {
+            spad_size_per_row = 2 * (src0_row_size_aligned + dst_row_size_aligned + src1_row_size_aligned);
+            vtcm_row_per_thread = (octx->ctx->vtcm_size) / (n_threads * spad_size_per_row);
+        }
+    } else {
+        spad_size_per_row   = 2 * (src0_row_size_aligned + dst_row_size_aligned);
+        vtcm_row_per_thread = (octx->ctx->vtcm_size)/ (n_threads * spad_size_per_row);
+    }
 
     // Make sure the reserved vtcm size is sufficient
     if (vtcm_row_per_thread == 0) {
@@ -797,8 +957,25 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) {
     octx->src0_spad.size = n_threads * octx->src0_spad.size_per_thread;
     octx->dst_spad.size  = n_threads * octx->dst_spad.size_per_thread;
 
+    if (octx->op == HTP_OP_RMS_NORM_MUL) {
+        if (broadcast_weight) {
+            octx->src1_spad.size_per_thread = src1_row_size_aligned;
+        } else {
+            octx->src1_spad.size_per_thread = src1_row_size_aligned * vtcm_row_per_thread * 2;
+        }
+        octx->src1_spad.size = n_threads * octx->src1_spad.size_per_thread;
+    } else {
+        octx->src1_spad.size = 0;
+        octx->src1_spad.size_per_thread = 0;
+    }
+
     octx->src0_spad.data = octx->ctx->vtcm_base;
-    octx->dst_spad.data  = octx->src0_spad.data + octx->src0_spad.size;
+    if (octx->op == HTP_OP_RMS_NORM_MUL) {
+        octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
+        octx->dst_spad.data  = octx->src1_spad.data + octx->src1_spad.size;
+    } else {
+        octx->dst_spad.data  = octx->src0_spad.data + octx->src0_spad.size;
+    }
 
     FARF(HIGH, "%s: (%ux%ux%ux%u) -> (%ux%ux%ux%u) : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n", op_type,
          src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
@@ -811,19 +988,24 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) {
             .src0_nrows            = src0_nrows,
 
             .data_src0             = (const uint8_t *)src0->data,
+            .data_src1             = (octx->op == HTP_OP_RMS_NORM_MUL) ? (const uint8_t *)src1->data : NULL,
             .data_dst              = (uint8_t *)dst->data,
 
             .src0_data_row_size    = src0_data_row_size,
+            .src1_data_row_size    = src1_data_row_size,
             .dst_data_row_size     = dst_data_row_size,
 
             .src0_row_size_aligned = src0_row_size_aligned,
+            .src1_row_size_aligned = src1_row_size_aligned,
             .dst_row_size_aligned  = dst_row_size_aligned,
 
             .src0_spad_half_size   = octx->src0_spad.size_per_thread / 2,
+            .src1_spad_half_size   = (octx->op == HTP_OP_RMS_NORM_MUL) ? (octx->src1_spad.size_per_thread / (broadcast_weight ? 1 : 2)) : 0,
             .dst_spad_half_size    = octx->dst_spad.size_per_thread / 2,
 
             .block                 = (octx->src0_spad.size_per_thread / 2) / src0_row_size_aligned,
             .nc                    = src0->ne[0],
+            .broadcast_weight      = broadcast_weight,
         };
 
         worker_pool_run_func(octx->ctx->worker_pool, unary_job_f32_per_thread, &uctx, n_threads);
diff --git a/ggml/src/ggml-hexagon/op-desc.h b/ggml/src/ggml-hexagon/op-desc.h
deleted file mode 100644
index a1e8ddd8b97..00000000000
--- a/ggml/src/ggml-hexagon/op-desc.h
+++ /dev/null
@@ -1,153 +0,0 @@
-#ifndef OP_DESC_H
-#define OP_DESC_H
-
-#define GGML_COMMON_IMPL_CPP
-#include "ggml-backend-impl.h"
-#include "ggml-common.h"
-
-#include <string>
-#include <stdio.h>
-
-struct op_desc {
-    char strides[64 * GGML_MAX_SRC];
-    char dims[64 * GGML_MAX_SRC];
-    char types[16 * GGML_MAX_SRC];
-    char buffs[64 * GGML_MAX_SRC];
-    char names[64 * GGML_MAX_SRC];
-
-    int format_tensor_dims(char * str, const struct ggml_tensor * t) {
-        if (t->ne[2] == 1 && t->ne[3] == 1) {
-            return sprintf(str, "%d:%d", (int) t->ne[0], (int) t->ne[1]);
-        } else {
-            return sprintf(str, "%d:%d:%d:%d", (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]);
-        }
-    }
-
-    void format_op_dims(char * str, const struct ggml_tensor * t) {
-        char * p = str;
-
-        // append src0 and src1 (if any)
-        if (t->src[0]) {
-            p += format_tensor_dims(p, t->src[0]);
-
-            for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
-                p += sprintf(p, " x ");
-                p += format_tensor_dims(p, t->src[i]);
-            }
-
-            p += sprintf(p, " -> ");
-        }
-
-        // format self dims separately for better visual alignment
-        char self[64];
-        format_tensor_dims(self, t);
-
-        p += sprintf(p, "%s", self);
-    }
-
-    int format_tensor_strides(char * str, const struct ggml_tensor * t) {
-        const char * c = ggml_is_contiguous(t) ? "" : "!";
-
-        if (t->ne[2] == 1 && t->ne[3] == 1) {
-            return sprintf(str, "%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], c);
-        } else {
-            return sprintf(str, "%zu:%zu:%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], (size_t) t->nb[2], (size_t) t->nb[3], c);
-        }
-    }
-
-    void format_op_strides(char * str, const struct ggml_tensor * t) {
-        char * p = str;
-
-        // append src0 and src1 (if any)
-        if (t->src[0]) {
-            p += format_tensor_strides(p, t->src[0]);
-
-            for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
-                p += sprintf(p, " x ");
-                p += format_tensor_strides(p, t->src[i]);
-            }
-
-            p += sprintf(p, " -> ");
-        }
-
-        // format self dims separately for better visual alignment
-        char self[64];
-        format_tensor_strides(self, t);
-
-        p += sprintf(p, "%s", self);
-    }
-
-    void format_op_types(char * str, const struct ggml_tensor * t) {
-        char * p = str;
-
-        // append src0 and src1 (if any)
-        if (t->src[0]) {
-            p += sprintf(p, "%s", ggml_type_name(t->src[0]->type));
-
-            for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
-                p += sprintf(p, " x ");
-                p += sprintf(p, "%s", ggml_type_name(t->src[i]->type));
-            }
-
-            p += sprintf(p, " -> ");
-        }
-
-        p += sprintf(p, "%s", ggml_type_name(t->type));
-    }
-
-    const char * tensor_buff_name(const struct ggml_tensor * t) {
-        if (t->buffer) {
-            return ggml_backend_buffer_name(t->buffer);
-        }
-        return "NONE";
-    }
-
-    void format_op_buffs(char * str, const struct ggml_tensor * t) {
-        char * p = str;
-
-        // append src0 and src1 (if any)
-        if (t->src[0]) {
-            p += sprintf(p, "%s", tensor_buff_name(t->src[0]));
-
-            for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
-                p += sprintf(p, " x ");
-                p += sprintf(p, "%s", tensor_buff_name(t->src[i]));
-            }
-
-            p += sprintf(p, " -> ");
-        }
-
-        p += sprintf(p, "%s", tensor_buff_name(t));
-    }
-
-    void format_op_names(char * str, const struct ggml_tensor * t) {
-        char * p = str;
-
-        // append src0 and src1 (if any)
-        if (t->src[0]) {
-            p += sprintf(p, "%s", t->src[0]->name);
-
-            for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
-                p += sprintf(p, " x ");
-                p += sprintf(p, "%s", t->src[i]->name);
-            }
-
-            p += sprintf(p, " -> ");
-        }
-
-        p += sprintf(p, "%s", t->name);
-    }
-
-    void format(const ggml_tensor * op) {
-        format_op_dims(dims, op);
-        format_op_strides(strides, op);
-        format_op_types(types, op);
-        format_op_buffs(buffs, op);
-        format_op_names(names, op);
-    }
-
-    op_desc() {}
-    op_desc(const ggml_tensor * op) { format(op); }
-};
-
-#endif // OP_DESC_H
diff --git a/ggml/src/ggml-metal/ggml-metal-device.cpp b/ggml/src/ggml-metal/ggml-metal-device.cpp
index ba006d9b31a..5d4b10d34b9 100644
--- a/ggml/src/ggml-metal/ggml-metal-device.cpp
+++ b/ggml/src/ggml-metal/ggml-metal-device.cpp
@@ -1732,6 +1732,8 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_rope(ggml_metal_
 ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_im2col(ggml_metal_library_t lib, const ggml_tensor * op) {
     assert(op->op == GGML_OP_IM2COL);
 
+    GGML_TENSOR_LOCALS(int64_t, ne0, op->src[0], ne);
+
     GGML_ASSERT(ggml_is_contiguous(op->src[1]));
     GGML_ASSERT(op->src[1]->type == GGML_TYPE_F32);
     GGML_ASSERT(op->type         == GGML_TYPE_F16 || op->type == GGML_TYPE_F32);
@@ -1739,7 +1741,11 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_im2col(ggml_meta
     char base[256];
     char name[256];
 
-    snprintf(base, 256, "kernel_im2col_%s", ggml_type_name(op->type));
+    if (ne00*ne01 <= 1024) {  // NOTE(review): ggml_metal_op_im2col picks its dispatch shape from KH*KW vs the pipeline's max threads per threadgroup, not from this test - confirm both always agree
+        snprintf(base, 256, "kernel_im2col_%s", ggml_type_name(op->type));
+    } else {
+        snprintf(base, 256, "kernel_im2col_ext_%s", ggml_type_name(op->type));
+    }
     snprintf(name, 256, "%s", base);
 
     ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp
index 206af227a2c..e2ce56e9e28 100644
--- a/ggml/src/ggml-metal/ggml-metal-ops.cpp
+++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp
@@ -3635,16 +3635,26 @@ int ggml_metal_op_im2col(ggml_metal_op_t ctx, int idx) {
 
     auto pipeline = ggml_metal_library_get_pipeline_im2col(lib, op);
 
-    GGML_ASSERT(KH*KW <= ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
+    if (KH*KW <= ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
+        const uint64_t ntptg0 = std::min(ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)/(KH*KW), N);
 
-    const uint64_t ntptg0 = std::min(ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)/(KH*KW), N);
+        ggml_metal_encoder_set_pipeline(enc, pipeline);
+        ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+        ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[1]), 1);
+        ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op),         2);
 
-    ggml_metal_encoder_set_pipeline(enc, pipeline);
-    ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
-    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[1]), 1);
-    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op),         2);
+        ggml_metal_encoder_dispatch_threadgroups(enc, IC, OH, OW, ntptg0, KH, KW);
+    } else {
+        const uint64_t n_threads = std::min(ggml_metal_pipeline_max_theads_per_threadgroup(pipeline), N);
+        const int64_t  quotient  = N / n_threads + (N % n_threads > 0 ? 1 : 0);
+
+        ggml_metal_encoder_set_pipeline(enc, pipeline);
+        ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+        ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[1]), 1);
+        ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op),         2);
 
-    ggml_metal_encoder_dispatch_threadgroups(enc, IC, OH, OW, ntptg0, KH, KW);
+        ggml_metal_encoder_dispatch_threadgroups(enc, quotient * CHW, OH, OW, n_threads, 1, 1);
+    }
 
     return 1;
 }
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index e772664ba91..4adf4614acb 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -4696,59 +4696,59 @@ kernel void kernel_im2col(
 template [[host_name("kernel_im2col_f32")]] kernel im2col_t kernel_im2col<float>;
 template [[host_name("kernel_im2col_f16")]] kernel im2col_t kernel_im2col<half>;
 
-// TODO: obsolete -- remove
-//typedef void (im2col_ext_t)(
-//        constant ggml_metal_kargs_im2col & args,
-//        device const float * x,
-//        device        char * dst,
-//        uint3 tgpig[[threadgroup_position_in_grid]],
-//        uint3  tgpg[[threadgroups_per_grid]],
-//        uint3 tpitg[[thread_position_in_threadgroup]],
-//        uint3   ntg[[threads_per_threadgroup]]);
-//
-//template <typename T>
-//kernel void kernel_im2col_ext(
-//        constant ggml_metal_kargs_im2col & args,
-//        device const float * x,
-//        device        char * dst,
-//        uint3 tgpig[[threadgroup_position_in_grid]],
-//        uint3  tgpg[[threadgroups_per_grid]],      // tgpg[0] = D x IC x KH x KW, CHW = IC x KH x KW
-//        uint3 tpitg[[thread_position_in_threadgroup]],
-//        uint3   ntg[[threads_per_threadgroup]]) {  // [M, 1, 1]
-//    const int64_t KHW = (int64_t)args.KHW;
-//
-//    const int64_t d   = tgpig[0] / args.CHW;
-//    const int64_t chw = tgpig[0] % args.CHW;
-//    const int64_t tgpig_0 = chw / KHW;  // 0 ~ (IC - 1)
-//    const int64_t HW = tgpig[0] % KHW;
-//
-//    const int64_t tpitg_0 = (d * ntg[0]) + tpitg[0];
-//    if (tpitg_0 >= args.N) {
-//        return;
-//    }
-//
-//    const int64_t tpitg_1 = HW / args.KW;
-//    const int64_t tpitg_2 = HW % args.KW;
-//
-//    const int64_t iiw = tgpig[2] * args.s0 + tpitg_2 * args.d0 - args.p0;
-//    const int64_t iih = tgpig[1] * args.s1 + tpitg_1 * args.d1 - args.p1;
-//
-//    const int64_t offset_dst =
-//        (tpitg_0 * tgpg[1] * tgpg[2] + tgpig[1] * tgpg[2] + tgpig[2]) * args.CHW +
-//        (tgpig_0 * KHW + tpitg_1 * args.KW + tpitg_2);
-//
-//    device T * pdst = (device T *) (dst);
-//
-//    if (iih < 0 || iih >= args.IH || iiw < 0 || iiw >= args.IW) {
-//        pdst[offset_dst] = 0.0f;
-//    } else {
-//        const int64_t offset_src = tpitg_0 * args.ofs0 + tgpig_0 * args.ofs1;
-//        pdst[offset_dst] = x[offset_src + iih * args.IW + iiw];
-//    }
-//}
-//
-//template [[host_name("kernel_im2col_ext_f32")]] kernel im2col_ext_t kernel_im2col_ext<float>;
-//template [[host_name("kernel_im2col_ext_f16")]] kernel im2col_ext_t kernel_im2col_ext<half>;
+// TODO: optimize
+typedef void (im2col_ext_t)(
+        constant ggml_metal_kargs_im2col & args,
+        device const float * x,
+        device        char * dst,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint3  tgpg[[threadgroups_per_grid]],
+        uint3 tpitg[[thread_position_in_threadgroup]],
+        uint3   ntg[[threads_per_threadgroup]]);
+
+template <typename T>  // T: element type stored to dst (instantiated below for float and half)
+kernel void kernel_im2col_ext(  // im2col with the kernel taps spread over the x grid axis: each surviving thread writes one dst element
+        constant ggml_metal_kargs_im2col & args,
+        device const float * x,                    // source data, always read as float
+        device        char * dst,                  // output buffer, reinterpreted as T below
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint3  tgpg[[threadgroups_per_grid]],      // tgpg[0] = D x IC x KH x KW, CHW = IC x KH x KW
+        uint3 tpitg[[thread_position_in_threadgroup]],
+        uint3   ntg[[threads_per_threadgroup]]) {  // [M, 1, 1]
+    const int64_t KHW = (int64_t)args.KHW;
+    // decompose the flat x grid index: d selects a chunk of ntg[0] entries along N, chw is the offset inside one CHW slice
+    const int64_t d   = tgpig[0] / args.CHW;
+    const int64_t chw = tgpig[0] % args.CHW;
+    const int64_t tgpig_0 = chw / KHW;  // 0 ~ (IC - 1)
+    const int64_t HW = tgpig[0] % KHW;  // position inside the kernel window; equals chw % KHW as long as CHW is a multiple of KHW
+    // index along N handled by this thread; the last chunk may be partial, so surplus threads exit early
+    const int64_t tpitg_0 = (d * ntg[0]) + tpitg[0];
+    if (tpitg_0 >= args.N) {
+        return;
+    }
+    // window row / column of the tap (the tgpig_*/tpitg_* names presumably mirror the grid indices of kernel_im2col -- confirm)
+    const int64_t tpitg_1 = HW / args.KW;
+    const int64_t tpitg_2 = HW % args.KW;
+    // input coordinates of the tap: output position * s + tap * d - p (s/d/p presumably stride/dilation/padding -- confirm in ggml_metal_kargs_im2col)
+    const int64_t iiw = tgpig[2] * args.s0 + tpitg_2 * args.d0 - args.p0;
+    const int64_t iih = tgpig[1] * args.s1 + tpitg_1 * args.d1 - args.p1;
+    // dst is indexed as [N][tgpg[1]][tgpg[2]][CHW]; the inner CHW index is tgpig_0*KHW + tpitg_1*KW + tpitg_2
+    const int64_t offset_dst =
+        (tpitg_0 * tgpg[1] * tgpg[2] + tgpig[1] * tgpg[2] + tgpig[2]) * args.CHW +
+        (tgpig_0 * KHW + tpitg_1 * args.KW + tpitg_2);
+
+    device T * pdst = (device T *) (dst);
+    // taps that fall outside the IH x IW input are written as zero
+    if (iih < 0 || iih >= args.IH || iiw < 0 || iiw >= args.IW) {
+        pdst[offset_dst] = 0.0f;
+    } else {
+        const int64_t offset_src = tpitg_0 * args.ofs0 + tgpig_0 * args.ofs1;  // ofs0 / ofs1: offsets into x per N entry / per channel, in float elements
+        pdst[offset_dst] = x[offset_src + iih * args.IW + iiw];
+    }
+}
+
+template [[host_name("kernel_im2col_ext_f32")]] kernel im2col_ext_t kernel_im2col_ext<float>;
+template [[host_name("kernel_im2col_ext_f16")]] kernel im2col_ext_t kernel_im2col_ext<half>;
 
 template <typename TK>
 kernel void kernel_conv_2d(
diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
index 6d6c3e8973d..751ec6116c0 100644
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -379,6 +379,8 @@ struct ggml_backend_opencl_device_context {
     GPU_FAMILY     gpu_family = GPU_FAMILY::UNKNOWN;
     ADRENO_GPU_GEN adreno_gen = ADRENO_GPU_GEN::ADRENO_UNKNOWN;
 
+    std::regex *opfilter = nullptr; // regex of ops to not claim
+    std::string opfilter_str; // regex string for opfilter
     size_t global_mem_size = 0;
 };
 
@@ -415,8 +417,6 @@ struct ggml_backend_opencl_context {
     bool has_qcom_subgroup_shuffle = false;     // cl_qcom_subgroup_shuffle
     bool disable_fusion;
 
-    std::regex *opfilter = nullptr; // regex of ops to not claim
-
     bool adreno_has_large_buffer;
     bool adreno_use_large_buffer;
     ggml_cl_compiler_version adreno_cl_compiler_version;
@@ -428,6 +428,8 @@ struct ggml_backend_opencl_context {
     size_t  image2d_max_width;
     size_t  image2d_max_height;
 
+    cl_device_svm_capabilities svm_caps;
+
     cl_context context;
     cl_command_queue queue;
 
@@ -3731,6 +3733,68 @@ static std::vector<ggml_backend_device> ggml_opencl_probe_devices(ggml_backend_r
     return found_devices;
 }
 
+static void ggml_opencl_print_backend_info(ggml_backend_opencl_device_context * dev_ctx) { // logs device capabilities and backend configuration
+    GGML_ASSERT(dev_ctx);
+    GGML_ASSERT(dev_ctx->backend_ctx);
+    // dev_ctx->backend_ctx must already be set (asserted above); every value logged below is read from it or from dev_ctx
+    auto * backend_ctx = dev_ctx->backend_ctx;
+    // size_t values are printed with %zu: size_t is not unsigned long on every platform (e.g. 64-bit Windows), so %lu is a mismatch there
+    GGML_LOG_INFO("ggml_opencl: OpenCL driver: %s\n",
+        backend_ctx->driver_version.c_str());
+    GGML_LOG_INFO("ggml_opencl: vector subgroup broadcast support: %s\n",
+        backend_ctx->has_vector_subgroup_broadcast ? "true" : "false");
+    GGML_LOG_INFO("ggml_opencl: device FP16 support: %s\n",
+        backend_ctx->fp16_support ? "true" : "false");
+    GGML_LOG_INFO("ggml_opencl: mem base addr align: %u\n",
+        backend_ctx->alignment);
+    GGML_LOG_INFO("ggml_opencl: global mem size: %zu MB\n",
+        backend_ctx->global_mem_size/1024/1024);
+    GGML_LOG_INFO("ggml_opencl: max mem alloc size: %zu MB\n",
+        backend_ctx->max_alloc_size/1024/1024);
+    GGML_LOG_INFO("ggml_opencl: device max image buffer size (pixels): %zu\n",
+        backend_ctx->image_max_buffer_size);
+    GGML_LOG_INFO("ggml_opencl: device max image2d size: %zu x %zu\n",
+        backend_ctx->image2d_max_width, backend_ctx->image2d_max_height);
+    GGML_LOG_INFO("ggml_opencl: device max workgroup size: %zu\n",
+        backend_ctx->max_workgroup_size);
+    GGML_LOG_INFO("ggml_opencl: SVM coarse grain buffer support: %s\n",
+        backend_ctx->svm_caps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER ? "true" : "false");
+    GGML_LOG_INFO("ggml_opencl: SVM fine grain buffer support: %s\n",
+        backend_ctx->svm_caps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER ? "true" : "false");
+    GGML_LOG_INFO("ggml_opencl: SVM fine grain system support: %s\n",
+        backend_ctx->svm_caps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM ? "true" : "false");
+    GGML_LOG_INFO("ggml_opencl: SVM atomics support: %s\n",
+        backend_ctx->svm_caps & CL_DEVICE_SVM_ATOMICS ? "true" : "false");
+    GGML_LOG_INFO("ggml_opencl: cl_qcom_subgroup_shuffle support: %s\n",
+        backend_ctx->has_qcom_subgroup_shuffle ? "true" : "false");
+
+    // Print out configurations
+#ifdef GGML_OPENCL_SOA_Q
+    GGML_LOG_INFO("ggml_opencl: flattening quantized weights representation as struct of arrays (GGML_OPENCL_SOA_Q)\n");
+#endif // GGML_OPENCL_SOA_Q
+
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+    GGML_LOG_INFO("ggml_opencl: using kernels optimized for Adreno (GGML_OPENCL_USE_ADRENO_KERNELS)\n");
+    if (backend_ctx->adreno_xmem_gemm_enabled) {
+        GGML_LOG_INFO("ggml_opencl: Adreno xmem F16xF32 GEMM enabled (temporary weight prepack)\n");
+    }
+#endif // GGML_OPENCL_USE_ADRENO_KERNELS
+    // NOTE(review): besides logging, this block clears adreno_use_large_buffer when the driver lacks support, so the flag only settles once this function has run
+    if (backend_ctx->adreno_use_large_buffer) {
+        if (!backend_ctx->adreno_has_large_buffer) {
+            GGML_LOG_INFO("ggml_opencl: Adreno large buffer requested but not supported by driver, will use regular buffer\n");
+            backend_ctx->adreno_use_large_buffer = false;
+        } else {
+            GGML_LOG_INFO("ggml_opencl: Adreno large buffer enabled\n");
+        }
+    }
+
+    if (dev_ctx->opfilter) {
+        // for information only, the actual regex object is created in ggml_opencl_is_device_supported
+        GGML_LOG_INFO("ggml_opencl: opfilter regex = \"%s\"\n", dev_ctx->opfilter_str.c_str());
+    }
+}
+
 // check if device should be accepted
 static bool ggml_opencl_is_device_supported(ggml_backend_dev_t dev) {
     GGML_ASSERT(dev);
@@ -3799,6 +3863,13 @@ static bool ggml_opencl_is_device_supported(ggml_backend_dev_t dev) {
     }
 
     clGetDeviceInfo(dev_ctx->device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(size_t), &dev_ctx->global_mem_size, NULL);
+
+    const char * str_opfilter = getenv("GGML_OPENCL_OPFILTER");
+    if (str_opfilter) {
+        dev_ctx->opfilter_str = str_opfilter;
+        dev_ctx->opfilter = new std::regex(str_opfilter, std::regex_constants::icase);
+    }
+
     return true;
 }
 
@@ -3850,15 +3921,12 @@ static ggml_backend_opencl_context * ggml_cl_init(ggml_backend_dev_t dev) {
     char *driver_version = (char *)alloca(driver_version_str_size + 1);
     clGetDeviceInfo(device, CL_DRIVER_VERSION, driver_version_str_size, driver_version, NULL);
     driver_version[driver_version_str_size] = '\0';
-    GGML_LOG_INFO("ggml_opencl: OpenCL driver: %s\n", driver_version);
     backend_ctx->driver_version = driver_version;
 
     backend_ctx->adreno_cl_compiler_version = get_adreno_cl_compiler_version(driver_version);
     backend_ctx->has_vector_subgroup_broadcast =
         (backend_ctx->adreno_cl_compiler_version.type == E031 && backend_ctx->adreno_cl_compiler_version.major >= 47) ||
         (backend_ctx->adreno_cl_compiler_version.type == DX   && backend_ctx->adreno_cl_compiler_version.major >= 17);
-    GGML_LOG_INFO("ggml_opencl: vector subgroup broadcast support: %s\n",
-        backend_ctx->has_vector_subgroup_broadcast ? "true" : "false");
 
     size_t ext_str_size;
     clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size);
@@ -3867,18 +3935,12 @@ static ggml_backend_opencl_context * ggml_cl_init(ggml_backend_dev_t dev) {
     ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated
 
     // check support for qcom_subgroup_shuffle
-    if (opencl_c_version.major == 3 && strstr(ext_buffer, "cl_khr_subgroups") != NULL) {
-        GGML_LOG_INFO("ggml_opencl: cl_khr_subgroups support: true\n");
-        if (strstr(ext_buffer, "cl_qcom_subgroup_shuffle") != NULL) {
-            backend_ctx->has_qcom_subgroup_shuffle = true;
-        }
+    if (strstr(ext_buffer, "cl_qcom_subgroup_shuffle") != NULL) {
+        backend_ctx->has_qcom_subgroup_shuffle = true;
     }
-    GGML_LOG_INFO("ggml_opencl: cl_qcom_subgroup_shuffle support: %s\n",
-        backend_ctx->has_qcom_subgroup_shuffle ? "true" : "false");
 
     // Check if ext_buffer contains cl_khr_fp16
     backend_ctx->fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL;
-    GGML_LOG_INFO("ggml_opencl: device FP16 support: %s\n", backend_ctx->fp16_support ? "true" : "false");
 
     // check Adreno large buffer support
     backend_ctx->adreno_has_large_buffer = strstr(ext_buffer, "cl_qcom_large_buffer") != NULL;
@@ -3887,35 +3949,15 @@ static ggml_backend_opencl_context * ggml_cl_init(ggml_backend_dev_t dev) {
     CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &base_align_in_bits, NULL));
     GGML_ASSERT(base_align_in_bits % 8u == 0);
     backend_ctx->alignment = base_align_in_bits / 8u;
-    GGML_LOG_INFO("ggml_opencl: mem base addr align: %u\n", backend_ctx->alignment);
 
     backend_ctx->global_mem_size = dev_ctx->global_mem_size;
-    GGML_LOG_INFO("ggml_opencl: global mem size: %zu MB\n", backend_ctx->global_mem_size/1024/1024);
-
-    clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &backend_ctx->max_alloc_size, NULL);
-    GGML_LOG_INFO("ggml_opencl: max mem alloc size: %zu MB\n", backend_ctx->max_alloc_size/1024/1024);
-
-    clGetDeviceInfo(device, CL_DEVICE_IMAGE_MAX_BUFFER_SIZE, sizeof(size_t), &backend_ctx->image_max_buffer_size, NULL);
-    GGML_LOG_INFO("ggml_opencl: device max image buffer size (pixels): %lu\n", backend_ctx->image_max_buffer_size);
 
-    clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), &backend_ctx->image2d_max_width, NULL);
-    clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), &backend_ctx->image2d_max_height, NULL);
-    GGML_LOG_INFO("ggml_opencl: device max image2d size: %lu x %lu\n", backend_ctx->image2d_max_width, backend_ctx->image2d_max_height);
-
-    clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &backend_ctx->max_workgroup_size, NULL);
-    GGML_LOG_INFO("ggml_opencl: device max workgroup size: %lu\n", backend_ctx->max_workgroup_size);
-
-    // Check SVM.
-    cl_device_svm_capabilities svm_caps;
-    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_SVM_CAPABILITIES, sizeof(cl_device_svm_capabilities), &svm_caps, 0));
-    GGML_LOG_INFO("ggml_opencl: SVM coarse grain buffer support: %s\n",
-        svm_caps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER ? "true" : "false");
-    GGML_LOG_INFO("ggml_opencl: SVM fine grain buffer support: %s\n",
-        svm_caps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER ? "true" : "false");
-    GGML_LOG_INFO("ggml_opencl: SVM fine grain system support: %s\n",
-        svm_caps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM ? "true" : "false");
-    GGML_LOG_INFO("ggml_opencl: SVM atomics support: %s\n",
-        svm_caps & CL_DEVICE_SVM_ATOMICS ? "true" : "false");
+    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &backend_ctx->max_alloc_size, NULL));
+    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_IMAGE_MAX_BUFFER_SIZE, sizeof(size_t), &backend_ctx->image_max_buffer_size, NULL));
+    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), &backend_ctx->image2d_max_width, NULL));
+    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), &backend_ctx->image2d_max_height, NULL));
+    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &backend_ctx->max_workgroup_size, NULL));
+    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_SVM_CAPABILITIES, sizeof(cl_device_svm_capabilities), &backend_ctx->svm_caps, 0));
 
     if (opencl_c_version.major >= 3) {
         // Assume it is not available for 3.0, since it is optional in 3.0.
@@ -3931,36 +3973,15 @@ static ggml_backend_opencl_context * ggml_cl_init(ggml_backend_dev_t dev) {
         backend_ctx->non_uniform_workgroups = true;
     }
 
-    // Print out configurations
-#ifdef GGML_OPENCL_SOA_Q
-    GGML_LOG_INFO("ggml_opencl: flattening quantized weights representation as struct of arrays (GGML_OPENCL_SOA_Q)\n");
-#endif // GGML_OPENCL_SOA_Q
-
-#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
-    GGML_LOG_INFO("ggml_opencl: using kernels optimized for Adreno (GGML_OPENCL_USE_ADRENO_KERNELS)\n");
-#endif // GGML_OPENCL_USE_ADRENO_KERNELS
-
 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+    // determine whether to use Adreno xmem GEMM
     backend_ctx->adreno_xmem_gemm_enabled = getenv("GGML_OPENCL_ADRENO_XMEM_GEMM") != nullptr &&
                                              backend_ctx->gpu_family == GPU_FAMILY::ADRENO;
-    if (getenv("GGML_OPENCL_ADRENO_XMEM_GEMM") != nullptr) {
-        GGML_LOG_INFO("ggml_opencl: Adreno xmem F16xF32 GEMM %s\n",
-                      backend_ctx->adreno_xmem_gemm_enabled ?
-                      "enabled (temporary weight prepack)" : "requested but unsupported by this driver");
-    }
-#endif // GGML_OPENCL_USE_ADRENO_KERNELS
+#endif
 
     // determine whether to use large buffer for Adreno
     backend_ctx->adreno_use_large_buffer = getenv("GGML_OPENCL_ADRENO_USE_LARGE_BUFFER") != nullptr &&
                                            backend_ctx->gpu_family == GPU_FAMILY::ADRENO;
-    if (backend_ctx->adreno_use_large_buffer) {
-        if (!backend_ctx->adreno_has_large_buffer) {
-            GGML_LOG_INFO("ggml_opencl: Adreno large buffer requested but not supported by driver, will use regular buffer\n");
-            backend_ctx->adreno_use_large_buffer = false;
-        } else {
-            GGML_LOG_INFO("ggml_opencl: Adreno large buffer enabled\n");
-        }
-    }
 
     cl_int err;
 
@@ -4010,12 +4031,6 @@ static ggml_backend_opencl_context * ggml_cl_init(ggml_backend_dev_t dev) {
 
     backend_ctx->disable_fusion = getenv("GGML_OPENCL_DISABLE_FUSION") != nullptr;
 
-    const char * str_opfilter = getenv("GGML_OPENCL_OPFILTER");
-    if (str_opfilter) {
-        backend_ctx->opfilter = new std::regex(str_opfilter, std::regex_constants::icase);
-        GGML_LOG_INFO("ggml_opencl: opfilter regex = \"%s\"\n", str_opfilter);
-    }
-
     dev_ctx->backend_ctx = backend_ctx.release();
     return dev_ctx->backend_ctx;
 }
@@ -4825,7 +4840,7 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
     ggml_backend_opencl_context *        backend_ctx = dev_ctx->backend_ctx;
 
     // reject ops that match the opfilter regex
-    if (backend_ctx->opfilter && std::regex_match(std::string(ggml_op_desc(op)), *backend_ctx->opfilter)) {
+    if (dev_ctx->opfilter && std::regex_match(std::string(ggml_op_desc(op)), *dev_ctx->opfilter)) {
         return false;
     }
 
@@ -7823,6 +7838,8 @@ static ggml_backend_t ggml_backend_opencl_device_init(ggml_backend_dev_t dev, co
         /* .context   = */ backend_ctx,
     };
 
+    ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) dev->context;
+    ggml_opencl_print_backend_info(dev_ctx);
     return backend;
 
     GGML_UNUSED(params);
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index c9f906d7930..2a30fb95c61 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -691,6 +691,7 @@ struct vk_device_struct {
     uint32_t coopmat_int_k;
 
     bool coopmat2;
+    bool coopmat2_bf16_support {};
     bool coopmat2_decode_vector;
 
     bool pipeline_executable_properties_support {};
@@ -3139,7 +3140,7 @@ struct vk_fa_tuning_params {
 };
 
 static bool ggml_vk_flash_attn_scalar_shmem_support(const vk_device& device, const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool f32acc, ggml_type k_type, ggml_type v_type);
-static bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool f32acc);
+static bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool f32acc, ggml_type k_type = GGML_TYPE_F16);
 
 static vk_fa_tuning_params get_fa_tuning_params_scalar(const vk_device& device, uint32_t hsk, uint32_t hsv, uint32_t n_rows, uint32_t n_kv, ggml_type k_type, ggml_type v_type, bool f32acc) {
 
@@ -3279,6 +3280,13 @@ static vk_fa_tuning_params get_fa_tuning_params(const vk_device& device, uint32_
     FaCodePath path = device->coopmat2 ? FA_COOPMAT2 :
                       device->coopmat1_fa_support ? FA_COOPMAT1 : FA_SCALAR;
 
+    if (path == FA_COOPMAT2 && k_type == GGML_TYPE_BF16 && !device->coopmat2_bf16_support) {
+        path = FA_COOPMAT1;
+    }
+    if (path == FA_COOPMAT1 && k_type == GGML_TYPE_BF16 && !device->coopmat_bf16_support) {
+        path = FA_SCALAR;
+    }
+
     if (path == FA_COOPMAT1 && device->architecture == vk_device_architecture::NVIDIA_TURING) {
         // Nvidia compiler bug, see https://github.com/ggml-org/llama.cpp/pull/19075#issuecomment-3820716090
         path = FA_SCALAR;
@@ -3288,7 +3296,7 @@ static vk_fa_tuning_params get_fa_tuning_params(const vk_device& device, uint32_
         bool shape_ok = (f32acc && device->coopmat_support_16x16x16_f32acc) ||
                         (!f32acc && device->coopmat_support_16x16x16_f16acc);
         const vk_fa_tuning_params params = get_fa_tuning_params_coopmat1(device, hsk, hsv, n_rows, n_kv, k_type, v_type, f32acc);
-        bool shmem_ok = ggml_vk_flash_attn_coopmat_shmem_support(device, params, hsk, hsv, f32acc);
+        bool shmem_ok = ggml_vk_flash_attn_coopmat_shmem_support(device, params, hsk, hsv, f32acc, k_type);
 
         if (!shape_ok || !shmem_ok) {
             path = FA_SCALAR;
@@ -3334,8 +3342,8 @@ static vk_fa_pipeline_state get_fa_pipeline_state(const vk_device& device, const
 
 static std::vector<uint32_t> get_fa_spec_constants(const vk_fa_pipeline_state& state) {
     const auto fa_block_bytes = [](ggml_type t) -> uint32_t {
-        // decodeBufF32 uses a block of vec4s for a better memory access pattern.
-        return t == GGML_TYPE_F32 ? 16u : (uint32_t) ggml_type_size(t);
+        if (t == GGML_TYPE_F32) return 16u;
+        return (uint32_t) ggml_type_size(t);
     };
     return {
         /* 0 WorkGroupSize   */ state.workgroup_size,
@@ -3849,10 +3857,16 @@ static void ggml_vk_load_shaders(vk_device& device) {
         const uint32_t fa_sgs = fa.first.subgroup_size;
         const bool fa_ds = fa.first.subgroup_size == 0;
 
+        const bool bf16_kv = fa.first.k_type == GGML_TYPE_BF16;
         const bool use_mmq = ggml_vk_fa_scalar_uses_mmq(device, fa.first.k_type);
         const void * spv_data = nullptr;
         size_t spv_size = 0;
-        if (use_mmq) {
+        const char *name = nullptr;
+        if (bf16_kv) {
+            spv_data = flash_attn_f32_f16_fp32_data;
+            spv_size = flash_attn_f32_f16_fp32_len;
+            name = aligned ? "flash_attn_f32_bf16_aligned" : "flash_attn_f32_bf16";
+        } else if (use_mmq) {
 #if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
             if (device->fp16) {
                 if (f32acc) { spv_data = flash_attn_f32_f16_int8_data;        spv_size = flash_attn_f32_f16_int8_len; }
@@ -3862,6 +3876,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
                 spv_size = flash_attn_f32_f16_fp32_int8_len;
             }
 #endif
+            name = aligned ? "flash_attn_f32_f16_aligned" : "flash_attn_f32_f16";
         } else {
             if (device->fp16) {
                 if (f32acc) { spv_data = flash_attn_f32_f16_data;        spv_size = flash_attn_f32_f16_len; }
@@ -3870,8 +3885,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
                 spv_data = flash_attn_f32_f16_fp32_data;
                 spv_size = flash_attn_f32_f16_fp32_len;
             }
+            name = aligned ? "flash_attn_f32_f16_aligned" : "flash_attn_f32_f16";
         }
-        const char *name = aligned ? "flash_attn_f32_f16_aligned" : "flash_attn_f32_f16";
         ggml_vk_create_pipeline(device, fa.second, name, spv_size, spv_data, "main", 7,
                                 sizeof(vk_flash_attn_push_constants), {Br, 1, 1},
                                 get_fa_spec_constants(fa.first), aligned ? Bc : 1, true,
@@ -3889,11 +3904,25 @@ static void ggml_vk_load_shaders(vk_device& device) {
             const uint32_t fa_sgs = fa.first.subgroup_size;
             const bool fa_ds = fa.first.subgroup_size == 0;
 
+            const bool bf16_kv = fa.first.k_type == GGML_TYPE_BF16;
+
             const void * spv_data;
             size_t spv_size;
-            if (f32acc) { spv_data = flash_attn_f32_f16_cm1_data;        spv_size = flash_attn_f32_f16_cm1_len; }
-            else        { spv_data = flash_attn_f32_f16_f16acc_cm1_data; spv_size = flash_attn_f32_f16_f16acc_cm1_len; }
-            const char *name = aligned ? "flash_attn_f32_f16_aligned_cm1" : "flash_attn_f32_f16_cm1";
+            const char *name;
+            if (bf16_kv) {
+#if defined(VK_KHR_shader_bfloat16) && defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT)
+                if (!device->coopmat_bf16_support) continue;
+                spv_data = flash_attn_f32_f16_bf16_cm1_data;
+                spv_size = flash_attn_f32_f16_bf16_cm1_len;
+                name = aligned ? "flash_attn_f32_bf16_aligned_cm1" : "flash_attn_f32_bf16_cm1";
+#else
+                continue;
+#endif
+            } else {
+                if (f32acc) { spv_data = flash_attn_f32_f16_cm1_data;        spv_size = flash_attn_f32_f16_cm1_len; }
+                else        { spv_data = flash_attn_f32_f16_f16acc_cm1_data; spv_size = flash_attn_f32_f16_f16acc_cm1_len; }
+                name = aligned ? "flash_attn_f32_f16_aligned_cm1" : "flash_attn_f32_f16_cm1";
+            }
             ggml_vk_create_pipeline(device, fa.second, name, spv_size, spv_data, "main", 7,
                                     sizeof(vk_flash_attn_push_constants), {Br, 1, 1},
                                     get_fa_spec_constants(fa.first), aligned ? Bc : 1, true,
@@ -3911,10 +3940,20 @@ static void ggml_vk_load_shaders(vk_device& device) {
             const bool aligned = fa.first.aligned;
             const bool f32acc = fa.first.f32acc;
 
+            const bool bf16_kv = fa.first.k_type == GGML_TYPE_BF16;
             const void * spv_data;
             size_t spv_size;
             const char * name;
-            if (aligned) {
+            if (bf16_kv) {
+#if defined(VK_KHR_shader_bfloat16) && defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT)
+                if (!device->coopmat2_bf16_support) continue;
+                spv_data = flash_attn_f32_f16_bf16_cm2_data;
+                spv_size = flash_attn_f32_f16_bf16_cm2_len;
+                name = aligned ? "flash_attn_f32_bf16_aligned_cm2" : "flash_attn_f32_bf16_cm2";
+#else
+                continue;
+#endif
+            } else if (aligned) {
                 if (f32acc) { spv_data = flash_attn_f32_f16_cm2_data;        spv_size = flash_attn_f32_f16_cm2_len;        name = "flash_attn_f32_f16_aligned_f32acc_cm2"; }
                 else        { spv_data = flash_attn_f32_f16_f16acc_cm2_data; spv_size = flash_attn_f32_f16_f16acc_cm2_len; name = "flash_attn_f32_f16_aligned_f16acc_cm2"; }
             } else {
@@ -5784,46 +5823,72 @@ static vk_device ggml_vk_get_device(size_t idx) {
                      found_fp16_256 = false,
                      found_fp32_128 = false,
                      found_fp32_256 = false;
+                bool found_bf16_128 = false,
+                     found_bf16_256 = false;
                 // need to support fp16*fp16 with fp16/fp32 accumulator, for workgroupsize 128
                 // with 32x16x16 and 256 with 32x32x16.
                 for (auto &prop : flexible_dimensions) {
                     if (prop.saturatingAccumulation == VK_FALSE &&
-                        prop.scope == VK_SCOPE_WORKGROUP_KHR &&
-                        prop.AType == VK_COMPONENT_TYPE_FLOAT16_KHR &&
-                        prop.BType == VK_COMPONENT_TYPE_FLOAT16_KHR) {
-
-                        if (prop.workgroupInvocations == 128 &&
-                            prop.MGranularity <= 32 &&
-                            prop.NGranularity <= 16 &&
-                            prop.KGranularity <= 16) {
-                            if (prop.CType == VK_COMPONENT_TYPE_FLOAT16_KHR &&
-                                prop.ResultType == VK_COMPONENT_TYPE_FLOAT16_KHR) {
-                                found_fp16_128 = true;
+                        prop.scope == VK_SCOPE_WORKGROUP_KHR) {
+
+                        if (prop.AType == VK_COMPONENT_TYPE_FLOAT16_KHR &&
+                            prop.BType == VK_COMPONENT_TYPE_FLOAT16_KHR) {
+
+                            if (prop.workgroupInvocations == 128 &&
+                                prop.MGranularity <= 32 &&
+                                prop.NGranularity <= 16 &&
+                                prop.KGranularity <= 16) {
+                                if (prop.CType == VK_COMPONENT_TYPE_FLOAT16_KHR &&
+                                    prop.ResultType == VK_COMPONENT_TYPE_FLOAT16_KHR) {
+                                    found_fp16_128 = true;
+                                }
+                                if (prop.CType == VK_COMPONENT_TYPE_FLOAT32_KHR &&
+                                    prop.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR) {
+                                    found_fp32_128 = true;
+                                }
                             }
-                            if (prop.CType == VK_COMPONENT_TYPE_FLOAT32_KHR &&
-                                prop.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR) {
-                                found_fp32_128 = true;
+                            if (prop.workgroupInvocations == 256 &&
+                                prop.MGranularity <= 32 &&
+                                prop.NGranularity <= 32 &&
+                                prop.KGranularity <= 16) {
+                                if (prop.CType == VK_COMPONENT_TYPE_FLOAT16_KHR &&
+                                    prop.ResultType == VK_COMPONENT_TYPE_FLOAT16_KHR) {
+                                    found_fp16_256 = true;
+                                }
+                                if (prop.CType == VK_COMPONENT_TYPE_FLOAT32_KHR &&
+                                    prop.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR) {
+                                    found_fp32_256 = true;
+                                }
                             }
                         }
-                        if (prop.workgroupInvocations == 256 &&
-                            prop.MGranularity <= 32 &&
-                            prop.NGranularity <= 32 &&
-                            prop.KGranularity <= 16) {
-                            if (prop.CType == VK_COMPONENT_TYPE_FLOAT16_KHR &&
-                                prop.ResultType == VK_COMPONENT_TYPE_FLOAT16_KHR) {
-                                found_fp16_256 = true;
+
+#if defined(VK_KHR_shader_bfloat16) && defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT)
+                        if (prop.AType == VK_COMPONENT_TYPE_BFLOAT16_KHR &&
+                            prop.BType == VK_COMPONENT_TYPE_BFLOAT16_KHR &&
+                            prop.CType == VK_COMPONENT_TYPE_FLOAT32_KHR &&
+                            prop.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR) {
+
+                            if (prop.workgroupInvocations == 128 &&
+                                prop.MGranularity <= 32 &&
+                                prop.NGranularity <= 16 &&
+                                prop.KGranularity <= 16) {
+                                found_bf16_128 = true;
                             }
-                            if (prop.CType == VK_COMPONENT_TYPE_FLOAT32_KHR &&
-                                prop.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR) {
-                                found_fp32_256 = true;
+                            if (prop.workgroupInvocations == 256 &&
+                                prop.MGranularity <= 32 &&
+                                prop.NGranularity <= 32 &&
+                                prop.KGranularity <= 16) {
+                                found_bf16_256 = true;
                             }
                         }
+#endif
                     }
                 }
                 if (found_fp16_128 && found_fp16_256 &&
                     found_fp32_128 && found_fp32_256 &&
                     coopmat2_props.cooperativeMatrixFlexibleDimensionsMaxDimension >= 512) {
                     device->coopmat2 = true;
+                    device->coopmat2_bf16_support = found_bf16_128 && found_bf16_256;
                     device->coopmat2_decode_vector = coopmat2_decode_vector_support && coopmat2_decode_vector_features.cooperativeMatrixDecodeVector;
                 }
             }
@@ -9448,7 +9513,8 @@ static bool ggml_vk_flash_attn_scalar_shmem_support(const vk_device& device, con
     const uint32_t Br = params.block_rows;
     const uint32_t Bc = params.block_cols;
 
-    const uint32_t float_type_size = device->fp16 ? sizeof(ggml_fp16_t) : sizeof(float);
+    // BF16 K/V uses the fp32 shader (FLOAT_TYPE=float), so size shmem with sizeof(float) even on fp16-capable devices
+    const uint32_t float_type_size = (device->fp16 && k_type != GGML_TYPE_BF16) ? sizeof(ggml_fp16_t) : sizeof(float);
 
     const bool mmq = ggml_vk_fa_scalar_uses_mmq(device, k_type);
 
@@ -9489,7 +9555,7 @@ static bool ggml_vk_flash_attn_scalar_shmem_support(const vk_device& device, con
     return supported;
 }
 
-static bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool f32acc) {
+static bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool f32acc, ggml_type k_type) {
     // Needs to be kept up to date on shader changes
     const uint32_t Br = params.block_rows;
     const uint32_t Bc = params.block_cols;
@@ -9519,8 +9585,10 @@ static bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, co
     const uint32_t vsh_stride = MatBc / 4 * row_split;
     const uint32_t ksh = ((kvshstride >= vsh_stride) ? (Bc * kvshstride) : (Bc * vsh_stride)) * f16vec4;
 
+    // BF16 PVMat accumulator is f32 (no bf16 accumulator support), so pvsh is vec4 (16 bytes)
+    const uint32_t pvsh_elem_size = (k_type == GGML_TYPE_BF16) ? 16u : f16vec4;
     const uint32_t osh_stride = params.row_split * MatBr / 4;
-    const uint32_t pvsh = MatBc * osh_stride * f16vec4;
+    const uint32_t pvsh = MatBc * osh_stride * pvsh_elem_size;
 
     const uint32_t slope = Br * acctype;
 
@@ -9589,7 +9657,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
     uint32_t workgroups_y = (uint32_t)neq2;
     uint32_t workgroups_z = (uint32_t)neq3;
 
-    const bool f32acc = !ctx->device->fp16 || dst->op_params[3] == GGML_PREC_F32;
+    const bool f32acc = !ctx->device->fp16 || dst->op_params[3] == GGML_PREC_F32 || k->type == GGML_TYPE_BF16;
 
     // For scalar/coopmat1 FA, we can use the "large" size to accommodate qga.
     // For coopmat2 FA, we always use the small size (which is still pretty large for gqa).
@@ -16400,6 +16468,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
                     switch (t) {
                     case GGML_TYPE_F32:
                     case GGML_TYPE_F16:
+                    case GGML_TYPE_BF16:
                     case GGML_TYPE_Q8_0:
                     case GGML_TYPE_Q5_1:
                     case GGML_TYPE_Q5_0:
@@ -16415,6 +16484,9 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
                 if (!fa_kv_ok(op->src[1]->type) || !fa_kv_ok(op->src[2]->type)) {
                     return false;
                 }
+                if ((op->src[1]->type == GGML_TYPE_BF16) != (op->src[2]->type == GGML_TYPE_BF16)) {
+                    return false;
+                }
                 if (!coopmat2 && !(device->subgroup_shuffle && device->subgroup_vote)) {
                     // scalar/coopmat1 FA uses subgroupShuffle/subgroupAll
                     return false;
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl
index 9a7957da97b..66dcf610219 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl
@@ -97,8 +97,17 @@ layout (binding = 6) readonly buffer MO {uint32_t data_mask_opt[];};
 #define FA_TYPE_Q5_0  6u
 #define FA_TYPE_Q5_1  7u
 #define FA_TYPE_Q8_0  8u
+#define FA_TYPE_BF16 30u
 #define FA_TYPE_Q1_0 41u
 
+#if defined(BFLOAT16)
+#define O_TYPE float
+#define O_TYPEV4 vec4
+#else
+#define O_TYPE FLOAT_TYPE
+#define O_TYPEV4 FLOAT_TYPEV4
+#endif
+
 // Number of matrix elements per buffer block, derived from the K/V type spec
 // constant. F32 is treated as a vec4 "block" of 4 floats. F16 uses block size 1
 // and bypasses the dequant path entirely. Quants follow their ggml block sizes.
@@ -111,6 +120,7 @@ uint fa_block_elems(uint ty) {
         case FA_TYPE_Q5_0: return uint(QUANT_K_Q5_0);
         case FA_TYPE_Q5_1: return uint(QUANT_K_Q5_1);
         case FA_TYPE_Q8_0: return uint(QUANT_K_Q8_0);
+        case FA_TYPE_BF16: return 1u;
         case FA_TYPE_Q1_0: return uint(QUANT_K_Q1_0); // cm2-only, harmless elsewhere
         default:           return 1u;
     }
@@ -248,7 +258,7 @@ const float FATTN_KQ_MAX_OFFSET = 3.0f*0.6931f;
 
 // Store the output when doing grouped query attention.
 // Rows index by Q's dimension 2, and the first N rows are valid.
-void gqaStore(const in uint32_t r, const in uint32_t c, const in FLOAT_TYPEV4 elems, const in uint32_t o_offset, const in uint32_t iq2, const in uint32_t N)
+void gqaStore(const in uint32_t r, const in uint32_t c, const in O_TYPEV4 elems, const in uint32_t o_offset, const in uint32_t iq2, const in uint32_t N)
 {
     uint32_t offset = (iq2 + r) * HSV / 4 + c;
     data_ov4[o_offset + offset] = D_TYPEV4(elems);
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp
index bffcc095be3..23ae3833e52 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp
@@ -6,6 +6,10 @@
 #extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
 #extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
 
+#if defined(BFLOAT16)
+#extension GL_EXT_bfloat16 : enable
+#endif
+
 #extension GL_KHR_shader_subgroup_basic : enable
 #extension GL_KHR_shader_subgroup_arithmetic : enable
 #extension GL_KHR_shader_subgroup_vote : enable
@@ -14,7 +18,9 @@
 
 #include "types.glsl"
 #include "flash_attn_base.glsl"
+#if !defined(BFLOAT16)
 #include "flash_attn_dequant.glsl"
+#endif
 
 // These need to be supported N,M values for a MatBc x MatBr x 16 coopmatmuladd
 const uint32_t MatBr = 16;
@@ -27,32 +33,32 @@ const uint32_t cols_per_thread = Bc / cols_per_iter;
 
 layout (binding = 0) readonly buffer Q {float data_q[];};
 layout (binding = 0) readonly buffer QV4 {vec4 data_qv4[];};
-layout (binding = 1) readonly buffer K {float16_t data_k[];};
-layout (binding = 1) readonly buffer KV4 {f16vec4 data_kv4[];};
-layout (binding = 2) readonly buffer V {float16_t data_v[];};
-layout (binding = 2) readonly buffer VV4 {f16vec4 data_vv4[];};
+layout (binding = 1) readonly buffer K {FLOAT_TYPE data_k[];};
+layout (binding = 1) readonly buffer KV4 {FLOAT_TYPEV4 data_kv4[];};
+layout (binding = 2) readonly buffer V {FLOAT_TYPE data_v[];};
+layout (binding = 2) readonly buffer VV4 {FLOAT_TYPEV4 data_vv4[];};
 layout (binding = 3) readonly buffer M {float16_t data_m[];};
 
 shared float tmpsh[row_split];
 
-const uint32_t qstride = HSK_pad / 4 + 2; // in units of f16vec4
-shared f16vec4 Qf[Br * qstride];
+const uint32_t qstride = HSK_pad / 4 + 2;
+shared FLOAT_TYPEV4 Qf[Br * qstride];
 
 const uint psh_stride = Br / 4 + 2;
-shared f16vec4 Psh[Bc * psh_stride];
+shared FLOAT_TYPEV4 Psh[Bc * psh_stride];
 
 // Avoid padding for hsk==256 to make it fit in 48KB shmem.
 const uint32_t sfshstride = (HSK <= 128) ? (Br / 4 + 2) : Br / 4;
 shared ACC_TYPEV4 sfsh[Bc * sfshstride];
 
 const uint32_t D_pad = HSK_pad > HSV_pad ? HSK_pad : HSV_pad;
-const uint32_t kvsh_stride = (SHMEM_STAGING != 0 ? D_pad : MatBr) / 4 + 2; // in units of f16vec4
+const uint32_t kvsh_stride = (SHMEM_STAGING != 0 ? D_pad : MatBr) / 4 + 2;
 const uint v_cols = MatBc / 4 * row_split; // total cols, 4 vec4s per MatBc * number of subgroups
 const uint vsh_stride = v_cols;
-shared f16vec4 kvsh[(kvsh_stride >= vsh_stride) ? (Bc * kvsh_stride) : (Bc * vsh_stride)];
+shared FLOAT_TYPEV4 kvsh[(kvsh_stride >= vsh_stride) ? (Bc * kvsh_stride) : (Bc * vsh_stride)];
 
 const uint32_t osh_stride = row_split * MatBr / 4;
-shared f16vec4 pvsh[MatBc * osh_stride];
+shared O_TYPEV4 pvsh[MatBc * osh_stride];
 
 shared ACC_TYPE slope[Br];
 
@@ -76,7 +82,7 @@ void main() {
     if ((HSK % 16) != 0) {
         [[unroll]] for (uint i = 0; i < Br * qstride; i += gl_WorkGroupSize.x) {
             if (i + tid < Br * qstride) {
-                Qf[i + tid] = f16vec4(0);
+                Qf[i + tid] = FLOAT_TYPEV4(0);
             }
         }
         barrier();
@@ -89,15 +95,15 @@ void main() {
         uint32_t r = (idx + tid) / (HSK / 4);
         if (r < Br && d < HSK / 4 &&
             i * Br + r < N) {
-            Qf[r * qstride + d] = f16vec4(data_qv4[q_offset / 4 + (i * Br + r) * q_stride / 4 + d] * p.scale);
+            Qf[r * qstride + d] = FLOAT_TYPEV4(data_qv4[q_offset / 4 + (i * Br + r) * q_stride / 4 + d] * p.scale);
         }
     }
     barrier();
 
-    f16vec4 Of[rows_per_thread][d_per_thread];
+    O_TYPEV4 Of[rows_per_thread][d_per_thread];
     [[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
         [[unroll]] for (uint32_t d = 0; d < d_per_thread; ++d) {
-            Of[r][d] = f16vec4(0.0);
+            Of[r][d] = O_TYPEV4(0.0);
         }
     }
 
@@ -222,15 +228,18 @@ void main() {
                 uint32_t d = (idx + tid) % (HSK_pad / 4);
                 uint32_t c = (idx + tid) / (HSK_pad / 4);
                 if (idx + gl_WorkGroupSize.x <= Bc * HSK_pad / 4 || c < Bc) {
-                    f16vec4 K_Tf = f16vec4(0);
+                    FLOAT_TYPEV4 K_Tf = FLOAT_TYPEV4(0);
                     if ((!KV_bounds_check || j * Bc + c < KV) && (HSK == HSK_pad || d < HSK / 4)) {
+#if !defined(BFLOAT16)
                         if (USE_DECODE_K) {
                             uint coord = (j * Bc + c) * k_stride * BLOCK_SIZE_K + 4 * d;
                             uint ib = coord / BLOCK_SIZE_K;
                             uint iqs = (coord % BLOCK_SIZE_K);
                             K_Tf = dequantize4(ib, iqs, k_offset, BINDING_IDX_K);
-                        } else {
-                            K_Tf = f16vec4(data_kv4[k_offset / 4 + (j * Bc + c) * k_stride / 4 + d]);
+                        } else
+#endif
+                        {
+                            K_Tf = FLOAT_TYPEV4(data_kv4[k_offset / 4 + (j * Bc + c) * k_stride / 4 + d]);
                         }
                     }
 
@@ -244,16 +253,16 @@ void main() {
         // Bc split across workgroup (four subgroups), loop over HSK in chunks of 16: 16 x 16 * 16 x 16 -> 16 x 16
         // This is written transposed in order to allow for N being 8 if implementations need it
         coopmat<ACC_TYPE, gl_ScopeSubgroup, MatBc, MatBr, gl_MatrixUseAccumulator> SfMat = coopmat<ACC_TYPE, gl_ScopeSubgroup, MatBc, MatBr, gl_MatrixUseAccumulator>(0);
-        coopmat<float16_t, gl_ScopeSubgroup, MatBc, 16, gl_MatrixUseA> KMat;
-        coopmat<float16_t, gl_ScopeSubgroup, 16, MatBr, gl_MatrixUseB> QMat;
+        coopmat<FLOAT_TYPE, gl_ScopeSubgroup, MatBc, 16, gl_MatrixUseA> KMat;
+        coopmat<FLOAT_TYPE, gl_ScopeSubgroup, 16, MatBr, gl_MatrixUseB> QMat;
 
         [[unroll]] for (uint32_t d = 0; d < HSK_pad / 16; ++d) {
             // If SHMEM_STAGING is set, a Bc * HSK_pad size tile of K is loaded to shmem
-            // If not, f16 K is loaded directly from global memory if aligned, otherwise
+            // If not, K is loaded directly from global memory if aligned, otherwise
             // staged through a Bc * MatBr size staging buffer.
-            // If K is not type f16, then it is always staged for dequantization.
+            // If K is a quant type, then it is always staged for dequantization.
             if (SHMEM_STAGING == 0) {
-            // For quants we always need to dequant into kvsh; for f16 we can load
+            // For quants we always need to dequant into kvsh; for f16/bf16 we can load
             // directly from global memory when alignment / bounds allow it.
             const bool stage_k = USE_DECODE_K || KV_bounds_check || d * 16 + 16 > HSK;
             if (stage_k) {
@@ -262,15 +271,18 @@ void main() {
                     uint32_t col_vec = (idx + tid) % (MatBr / 4);
                     uint32_t row = (idx + tid) / (MatBr / 4);
                     if (idx + tid < Bc * MatBr / 4) {
-                        f16vec4 K_Tf = f16vec4(0);
+                        FLOAT_TYPEV4 K_Tf = FLOAT_TYPEV4(0);
                         if ((!KV_bounds_check || j * Bc + row < KV) && (HSK == HSK_pad || d * 16 + col_vec * 4 < HSK)) {
+#if !defined(BFLOAT16)
                             if (USE_DECODE_K) {
                                 uint coord = (j * Bc + row) * k_stride * BLOCK_SIZE_K + d * 16 + col_vec * 4;
                                 uint ib = coord / BLOCK_SIZE_K;
                                 uint iqs = (coord % BLOCK_SIZE_K);
                                 K_Tf = dequantize4(ib, iqs, k_offset, BINDING_IDX_K);
-                            } else {
-                                K_Tf = f16vec4(data_kv4[k_offset / 4 + (j * Bc + row) * k_stride / 4 + d * 16 / 4 + col_vec]);
+                            } else
+#endif
+                            {
+                                K_Tf = FLOAT_TYPEV4(data_kv4[k_offset / 4 + (j * Bc + row) * k_stride / 4 + d * 16 / 4 + col_vec]);
                             }
                         }
 
@@ -357,7 +369,7 @@ void main() {
         [[unroll]] for (uint32_t d0 = 0; d0 < HSV / 4; d0 += threads_per_rowgroup) {
             const uint d_local = d0 / threads_per_rowgroup;
             [[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
-                Of[r][d_local] = float16_t(eMf[r]) * Of[r][d_local];
+                Of[r][d_local] = O_TYPE(eMf[r]) * Of[r][d_local];
             }
         }
 
@@ -368,10 +380,10 @@ void main() {
             [[unroll]] for (uint32_t r = 0; r < rows_per_thread; r += 4) {
                 const uint row = tile_row(r);
                 if (KV_bounds_check && j * Bc + col >= KV) {
-                    Psh[col * psh_stride + row / 4] = f16vec4(0.0f);
+                    Psh[col * psh_stride + row / 4] = FLOAT_TYPEV4(0.0f);
                 } else {
                     const vec4 mfvec = vec4(Mf[r], Mf[r + 1], Mf[r + 2], Mf[r + 3]);
-                    const f16vec4 Pf = f16vec4(exp(vec4(sfsh[row / 4 + col * sfshstride]) - mfvec));
+                    const FLOAT_TYPEV4 Pf = FLOAT_TYPEV4(exp(vec4(sfsh[row / 4 + col * sfshstride]) - mfvec));
                     [[unroll]] for (uint32_t vec_idx = 0; vec_idx < 4; ++vec_idx) {
                         Lf[r + vec_idx] += Pf[vec_idx];
                     }
@@ -385,15 +397,18 @@ void main() {
                 uint32_t d = (idx + tid) % (HSV_pad / 4);
                 uint32_t c = (idx + tid) / (HSV_pad / 4);
                 if (idx + gl_WorkGroupSize.x <= Bc * HSV_pad / 4 || c < Bc) {
-                    f16vec4 V_Tf = f16vec4(0);
+                    FLOAT_TYPEV4 V_Tf = FLOAT_TYPEV4(0);
                     if ((!KV_bounds_check || j * Bc + c < KV) && (HSV == HSV_pad || d < HSV / 4)) {
+#if !defined(BFLOAT16)
                         if (USE_DECODE_V) {
                             uint coord = (j * Bc + c) * v_stride * BLOCK_SIZE_V + 4 * d;
                             uint ib = coord / BLOCK_SIZE_V;
                             uint iqs = (coord % BLOCK_SIZE_V);
                             V_Tf = dequantize4(ib, iqs, v_offset, BINDING_IDX_V);
-                        } else {
-                            V_Tf = f16vec4(data_vv4[v_offset / 4 + (j * Bc + c) * v_stride / 4 + d]);
+                        } else
+#endif
+                        {
+                            V_Tf = FLOAT_TYPEV4(data_vv4[v_offset / 4 + (j * Bc + c) * v_stride / 4 + d]);
                         }
                     }
 
@@ -409,7 +424,7 @@ void main() {
         [[unroll]] for (uint32_t hsv_tile = 0; hsv_tile < num_hsv_tiles; ++hsv_tile) {
             const uint hsv_offset = (hsv_tile * row_split + gl_SubgroupID) * 16;
 
-            coopmat<float16_t, gl_ScopeSubgroup, MatBc, MatBr, gl_MatrixUseAccumulator> PVMat = coopmat<float16_t, gl_ScopeSubgroup, MatBc, MatBr, gl_MatrixUseAccumulator>(0);
+            coopmat<O_TYPE, gl_ScopeSubgroup, MatBc, MatBr, gl_MatrixUseAccumulator> PVMat = coopmat<O_TYPE, gl_ScopeSubgroup, MatBc, MatBr, gl_MatrixUseAccumulator>(0);
 
             // Preload V tiles for [Bc, 16 * num subgroups]
             const uint v_rows = Bc;
@@ -417,11 +432,11 @@ void main() {
             const uint v_loads_per_thread = v_total / gl_WorkGroupSize.x;
 
             // If SHMEM_STAGING is set, a Bc * HSV_pad size tile of V is loaded to shmem.
-            // If not, f16 V is loaded directly from global memory if aligned, otherwise
+            // If not, V is loaded directly from global memory if aligned, otherwise
             // staged through a Bc * MatBr size staging buffer.
-            // If V is not type f16, then it is always staged for dequantization.
+            // If V is a quant type, then it is always staged for dequantization.
             if (SHMEM_STAGING == 0) {
-            // For quants we always preload via kvsh. For f16 we only preload when
+            // For quants we always preload via kvsh. For f16/bf16 we only preload when
             // alignment / bounds force it (otherwise we coopMatLoad direct from data_vv4).
             const bool stage_v = USE_DECODE_V || KV_bounds_check;
             if (stage_v) {
@@ -438,13 +453,16 @@ void main() {
                     const uint iqs = coord % BLOCK_SIZE_V;
 
                     if (!KV_bounds_check || (v_row < KV && v_col < HSV)) {
+#if !defined(BFLOAT16)
                         if (USE_DECODE_V) {
                             kvsh[row * vsh_stride + col] = dequantize4(ib, iqs, v_offset, BINDING_IDX_V);
-                        } else {
+                        } else
+#endif
+                        {
                             kvsh[row * vsh_stride + col] = data_vv4[(v_offset + v_row * v_stride + v_col) / 4];
                         }
                     } else {
-                        kvsh[row * vsh_stride + col] = f16vec4(0.0f);
+                        kvsh[row * vsh_stride + col] = FLOAT_TYPEV4(0.0f);
                     }
                 }
             }
@@ -459,7 +477,7 @@ void main() {
 
                     if (SHMEM_STAGING == 0) {
                     if (!USE_DECODE_V && !KV_bounds_check) {
-                        // F16 values can be loaded directly from global memory
+                        // F16/BF16 values can be loaded directly from global memory
                         const uint v_tile_row = j * Bc + bc_chunk * MatBc;
                         const uint v_tile_offset = v_offset / 4 + v_tile_row * v_stride / 4 + hsv_offset / 4;
                         coopMatLoad(QMat, data_vv4, v_tile_offset, v_stride / 4, gl_CooperativeMatrixLayoutRowMajor);
@@ -573,7 +591,7 @@ void main() {
 
                 [[unroll]] for (uint32_t d0 = 0; d0 < HSV / 4; d0 += threads_per_rowgroup) {
                     const uint d_local = d0 / threads_per_rowgroup;
-                    Of[r][d_local] *= float16_t(ms);
+                    Of[r][d_local] *= O_TYPE(ms);
                 }
             } else {
                 vs = exp(sink - Mf[r]);
@@ -591,7 +609,7 @@ void main() {
     [[unroll]] for (uint32_t d0 = 0; d0 < HSV / 4; d0 += threads_per_rowgroup) {
         const uint d_local = d0 / threads_per_rowgroup;
         [[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
-            Of[r][d_local] *= float16_t(Lfrcp[r]);
+            Of[r][d_local] *= O_TYPE(Lfrcp[r]);
 #if defined(FLOAT_TYPE_MAX)
             Of[r][d_local] = clamp(Of[r][d_local], -FLOAT_TYPE_MAX, FLOAT_TYPE_MAX);
 #endif
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp
index 6d45b4931df..b9c03fe499d 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp
@@ -8,6 +8,10 @@
 #extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
 #extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
 
+#if defined(BFLOAT16)
+#extension GL_EXT_bfloat16 : enable
+#endif
+
 #extension GL_KHR_memory_scope_semantics : enable
 #extension GL_KHR_cooperative_matrix : enable
 #extension GL_NV_cooperative_matrix2 : enable
@@ -21,7 +25,9 @@
 
 #include "types.glsl"
 #include "flash_attn_base.glsl"
+#if !defined(BFLOAT16)
 #include "dequant_funcs_cm2.glsl"
+#endif
 
 // buffer_reference stride = sizeof(struct) = FaBlockBytesK/V.
 layout(buffer_reference, std430, buffer_reference_align = 1) buffer decodeBufFA_K {
@@ -31,6 +37,7 @@ layout(buffer_reference, std430, buffer_reference_align = 1) buffer decodeBufFA_
     uint8_t raw[FaBlockBytesV];
 };
 
+#if !defined(BFLOAT16)
 float16_t faDecodeK(const decodeBufFA_K bl_in, const uint blockCoords[2], const uint coordInBlock[2]) {
     switch (FaTypeK) {
         case FA_TYPE_F32:  return dequantFuncF32 (decodeBufF32 (bl_in), blockCoords, coordInBlock);
@@ -91,6 +98,7 @@ f16vec4 faDecodeVVector(const decodeBufFA_V bl_in, const uint blockCoords[2], co
 #define FADECODEK , faDecodeK
 #define FADECODEV , faDecodeV
 #endif
+#endif
 
 layout (binding = 0) readonly buffer Q {uint8_t data_q[];};
 layout (binding = 1) readonly buffer K {uint8_t data_k[];};
@@ -195,15 +203,15 @@ void main() {
     tensorLayoutV = setTensorLayoutStrideNV(tensorLayoutV, v_stride, 1);
 
     coopmat<Q_TYPE, gl_ScopeWorkgroup, Br, HSK_pad, gl_MatrixUseAccumulator> Q;
-    coopmat<float16_t, gl_ScopeWorkgroup, Br, HSK_pad, gl_MatrixUseA> Qf16;
+    coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, Br, HSK_pad, gl_MatrixUseA> Qf16;
 
     uint32_t q_offset = gqa_iq1*p.nb01*4/*sizeof(float)*/ + iq2*p.nb02+iq3*p.nb03;
     coopMatLoadTensorNV(Q, data_q, q_offset, sliceTensorLayoutNV(tensorLayoutQ, i * Br, Br, 0, HSK_pad));
 
-    Qf16 = coopmat<float16_t, gl_ScopeWorkgroup, Br, HSK_pad, gl_MatrixUseA>(Q);
-    Qf16 *= float16_t(p.scale);
+    Q *= Q_TYPE(p.scale);
+    Qf16 = coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, Br, HSK_pad, gl_MatrixUseA>(Q);
 
-    coopmat<float16_t, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator> O = coopmat<float16_t, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator>(0);
+    coopmat<O_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator> O = coopmat<O_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator>(0);
 
     coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> L, M;
 
@@ -291,16 +299,20 @@ void main() {
 
         coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> S = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(0);
 
-        coopmat<float16_t, gl_ScopeWorkgroup, HSK_pad, Bc, gl_MatrixUseB> K_T;
+        coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, HSK_pad, Bc, gl_MatrixUseB> K_T;
 
         uint32_t k_offset = ik2*p.nb12 + ik3*p.nb13;
         // F16: bs_k==1 (direct load). F32: bs_k==4 (vec4 / dequantFuncF32). Q4/Q8 family: bs_k==32. Q1_0: bs_k==128.
+#if defined(BFLOAT16)
+        coopMatLoadTensorNV(K_T, data_k, k_offset, sliceTensorLayoutNV(tensorLayoutK, j * Bc, Bc, 0, HSK_pad), tensorViewTranspose);
+#else
         const bool k_use_decode = (bs_k > 1u);
         if (k_use_decode) {
             coopMatLoadTensorNV(K_T, data_k, k_offset, sliceTensorLayoutNV(tensorLayoutK, j * Bc, Bc, 0, HSK_pad), tensorViewTranspose FADECODEK);
         } else {
             coopMatLoadTensorNV(K_T, data_k, k_offset, sliceTensorLayoutNV(tensorLayoutK, j * Bc, Bc, 0, HSK_pad), tensorViewTranspose);
         }
+#endif
         S = coopMatMulAdd(Qf16, K_T, S);
 
         if (LOGIT_SOFTCAP) {
@@ -351,22 +363,26 @@ void main() {
             coopMatPerElementNV(P, P, replacePadding, ACC_TYPE(0.0), R, C);
         }
 
-        coopmat<float16_t, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseA> P_A = coopmat<float16_t, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseA>(P);
+        coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseA> P_A = coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseA>(P);
 
         // compute rowsum by multiplying by matrix of all ones.
-        coopmat<float16_t, gl_ScopeWorkgroup, Bc, Bc, gl_MatrixUseB> One = coopmat<float16_t, gl_ScopeWorkgroup, Bc, Bc, gl_MatrixUseB>(1.0);
+        coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, Bc, Bc, gl_MatrixUseB> One = coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, Bc, Bc, gl_MatrixUseB>(1.0);
 
         rowsum = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(0.0);
         rowsum = coopMatMulAdd(P_A, One, rowsum);
 
-        coopmat<float16_t, gl_ScopeWorkgroup, Bc, HSV_pad, gl_MatrixUseB> V;
+        coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, Bc, HSV_pad, gl_MatrixUseB> V;
         uint32_t v_offset = iv2*p.nb22 + iv3*p.nb23;
+#if defined(BFLOAT16)
+        coopMatLoadTensorNV(V, data_v, v_offset, sliceTensorLayoutNV(tensorLayoutV, j * Bc, Bc, 0, HSV_pad));
+#else
         const bool v_use_decode = (bs_v > 1u);
         if (v_use_decode) {
             coopMatLoadTensorNV(V, data_v, v_offset, sliceTensorLayoutNV(tensorLayoutV, j * Bc, Bc, 0, HSV_pad) FADECODEV);
         } else {
             coopMatLoadTensorNV(V, data_v, v_offset, sliceTensorLayoutNV(tensorLayoutV, j * Bc, Bc, 0, HSV_pad));
         }
+#endif
 
         L = eM*L + rowsum;
 
@@ -378,7 +394,7 @@ void main() {
         // resize eM by using smear/reduce
         coopMatReduceNV(eMdiag, eM, gl_CooperativeMatrixReduceRowNV, smearReduce);
 
-        O *= coopmat<float16_t, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator>(eMdiag);
+        O *= coopmat<O_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator>(eMdiag);
         O = coopMatMulAdd(P_A, V, O);
     }
 
@@ -427,7 +443,7 @@ void main() {
             if (sink > Mr[i]) {
                 ms = exp(Mr[i] - sink);
 
-                O[i] *= float16_t(ms);
+                O[i] *= O_TYPE(ms);
             } else {
                 vs = exp(sink - Mr[i]);
             }
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_dequant.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_dequant.glsl
index 02106f33cbe..8704479d960 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_dequant.glsl
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_dequant.glsl
@@ -28,6 +28,9 @@ layout (binding = 2) readonly buffer V_PACKED_Q5_1 { block_q5_1_packed16 data[];
 layout (binding = 1) readonly buffer K_PACKED_Q8_0 { block_q8_0_packed16 data[]; } k_packed_q8_0;
 layout (binding = 2) readonly buffer V_PACKED_Q8_0 { block_q8_0_packed16 data[]; } v_packed_q8_0;
 
+layout (binding = 1) readonly buffer K_PACKED_BF16 { u16vec4 data[]; } k_packed_bf16;
+layout (binding = 2) readonly buffer V_PACKED_BF16 { u16vec4 data[]; } v_packed_bf16;
+
 // Q4_1 and Q5_1 packed32 views: aliased to the same memory as the packed16
 // views, used by the MMQ K-side hot path for fast 4-uint loads.
 layout (binding = 1) readonly buffer K_PACKED_Q4_1_P32 { block_q4_1_packed32 data[]; } k_packed_q4_1_p32;
@@ -99,6 +102,9 @@ layout (binding = 1) readonly buffer K_PACKED_Q5_1_P32 { block_q5_1_packed32 dat
     return FLOAT_TYPE(BUF.data[a_offset + ib].d) * FLOAT_TYPEV4(v0.x, v0.y, v1.x, v1.y);          \
 }
 
+#define FA_DEQUANT4_BF16(BUF) \
+    return FLOAT_TYPEV4(bf16_to_fp32(uvec4(BUF.data[(a_offset + ib) / 4])));
+
 FLOAT_TYPEV4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
     if (binding_idx == BINDING_IDX_K) {
         switch (FaTypeK) {
@@ -108,6 +114,7 @@ FLOAT_TYPEV4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
             case FA_TYPE_Q5_0: FA_DEQUANT4_Q5_0(k_packed_q5_0)
             case FA_TYPE_Q5_1: FA_DEQUANT4_Q5_1(k_packed_q5_1)
             case FA_TYPE_Q8_0: FA_DEQUANT4_Q8_0(k_packed_q8_0)
+            case FA_TYPE_BF16: FA_DEQUANT4_BF16(k_packed_bf16)
         }
     } else {
         switch (FaTypeV) {
@@ -117,6 +124,7 @@ FLOAT_TYPEV4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
             case FA_TYPE_Q5_0: FA_DEQUANT4_Q5_0(v_packed_q5_0)
             case FA_TYPE_Q5_1: FA_DEQUANT4_Q5_1(v_packed_q5_1)
             case FA_TYPE_Q8_0: FA_DEQUANT4_Q8_0(v_packed_q8_0)
+            case FA_TYPE_BF16: FA_DEQUANT4_BF16(v_packed_bf16)
         }
     }
     return FLOAT_TYPEV4(0);
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
index fa9b938e4f7..de7dbec2c63 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
@@ -662,6 +662,28 @@ void process_shaders() {
         }
     }
 
+    const std::map<std::string, std::string> fa_bf16_dict = {
+        {"FLOAT_TYPE",   "bfloat16_t"},
+        {"FLOAT_TYPEV2", "bf16vec2"},
+        {"FLOAT_TYPEV4", "bf16vec4"},
+        {"ACC_TYPE",     "float"},
+        {"ACC_TYPEV2",   "vec2"},
+        {"ACC_TYPEV4",   "vec4"},
+        {"BFLOAT16",     "1"},
+    };
+
+#if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
+    string_to_spv("flash_attn_f32_f16_bf16", "flash_attn_cm1.comp",
+        merge_maps(fa_bf16_dict, {{"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"D_TYPEV4", "vec4"}, {"COOPMAT", "1"}}),
+        true, true, false, false);
+#endif
+
+#if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
+    string_to_spv("flash_attn_f32_f16_bf16", "flash_attn_cm2.comp",
+        merge_maps(fa_bf16_dict, {{"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"D_TYPEV4", "vec4"}}),
+        true, false, true, false);
+#endif
+
     std::map<std::string, std::string> base_dict = {{"FLOAT_TYPE", "float"}, {"FLOAT_TYPEV2", "vec2"}};
 
     for (const auto& tname : type_names) {
diff --git a/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp b/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
index 60e98a60741..f4c5eca0df5 100644
--- a/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
@@ -84,16 +84,16 @@ struct ggml_webgpu_shader_lib_context {
     ggml_tensor * src5;
     ggml_tensor * dst;
 
-    uint32_t max_wg_size;
-    size_t   wg_mem_limit_bytes       = 0;
-    bool     supports_subgroups       = false;
-    bool     supports_subgroup_matrix = false;
-    uint32_t sg_mat_m                 = 0;
-    uint32_t sg_mat_n                 = 0;
-    uint32_t sg_mat_k                 = 0;
-    uint32_t min_subgroup_size        = 0;
-    uint32_t max_subgroup_size        = 0;
-    bool     supports_dot_product     = false;
+    uint32_t    max_wg_size;
+    size_t      wg_mem_limit_bytes       = 0;
+    bool        supports_subgroups       = false;
+    bool        supports_subgroup_matrix = false;
+    uint32_t    sg_mat_m                 = 0;
+    uint32_t    sg_mat_n                 = 0;
+    uint32_t    sg_mat_k                 = 0;
+    uint32_t    min_subgroup_size        = 0;
+    uint32_t    max_subgroup_size        = 0;
+    bool        supports_dot_product     = false;
     std::string vendor;
 };
 
@@ -166,9 +166,11 @@ struct ggml_webgpu_set_rows_pipeline_key {
     int dst_type;
     int vec4;
     int i64_idx;
+    int pair_blocks;
 
     bool operator==(const ggml_webgpu_set_rows_pipeline_key & other) const {
-        return dst_type == other.dst_type && vec4 == other.vec4 && i64_idx == other.i64_idx;
+        return dst_type == other.dst_type && vec4 == other.vec4 && i64_idx == other.i64_idx &&
+               pair_blocks == other.pair_blocks;
     }
 };
 
@@ -178,6 +180,7 @@ struct ggml_webgpu_set_rows_pipeline_key_hash {
         ggml_webgpu_hash_combine(seed, key.dst_type);
         ggml_webgpu_hash_combine(seed, key.vec4);
         ggml_webgpu_hash_combine(seed, key.i64_idx);
+        ggml_webgpu_hash_combine(seed, key.pair_blocks);
         return seed;
     }
 };
@@ -185,6 +188,7 @@ struct ggml_webgpu_set_rows_pipeline_key_hash {
 struct ggml_webgpu_set_rows_shader_decisions {
     bool     vec4;
     bool     i64_idx;
+    bool     pair_blocks;  // quantized dst only: each shader invocation handles two adjacent blocks
     uint32_t wg_size;
 };
 
@@ -772,31 +776,30 @@ inline ggml_webgpu_flash_attn_decisions ggml_webgpu_flash_attn_get_decisions(
                                   (v_offset_elems % GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH == 0u);
     const bool kv_vec_type_supported =
         K->type == GGML_TYPE_F16 || K->type == GGML_TYPE_Q4_0 || K->type == GGML_TYPE_Q8_0;
-    const uint32_t kv_vec_head_align = K->type == GGML_TYPE_F16 ? GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH :
-                                                                  (uint32_t) ggml_blck_size(K->type);
-    const bool kv_vec_head_dims_aligned = context.src0->ne[0] % kv_vec_head_align == 0 &&
-                                          context.src2->ne[0] % kv_vec_head_align == 0;
+    const uint32_t kv_vec_head_align =
+        K->type == GGML_TYPE_F16 ? GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH : (uint32_t) ggml_blck_size(K->type);
+    const bool kv_vec_head_dims_aligned =
+        context.src0->ne[0] % kv_vec_head_align == 0 && context.src2->ne[0] % kv_vec_head_align == 0;
     // Compile with enough invocations to cover the largest reported subgroup.
-    const bool use_vec = context.supports_subgroups && (context.src0->ne[1] < 20) &&
-                         kv_vec_head_dims_aligned && kv_vec_type_supported &&
-                         (K->type != GGML_TYPE_F16 || f16_vec4_aligned) &&
+    const bool use_vec = context.supports_subgroups && (context.src0->ne[1] < 20) && kv_vec_head_dims_aligned &&
+                         kv_vec_type_supported && (K->type != GGML_TYPE_F16 || f16_vec4_aligned) &&
                          (context.src2->type == K->type);
     const bool tile_can_dispatch_all_q_rows =
         context.max_subgroup_size > 0 &&
         context.max_wg_size >= GGML_WEBGPU_FLASH_ATTN_TILE_Q_TILE * context.max_subgroup_size;
-    const bool use_subgroup_matrix =
-        context.supports_subgroup_matrix && context.sg_mat_k > 0 && context.sg_mat_n > 0 &&
-        context.src0->ne[0] % context.sg_mat_k == 0 && context.src2->ne[0] % context.sg_mat_n == 0;
+    const bool use_subgroup_matrix = context.supports_subgroup_matrix && context.sg_mat_k > 0 && context.sg_mat_n > 0 &&
+                                     context.src0->ne[0] % context.sg_mat_k == 0 &&
+                                     context.src2->ne[0] % context.sg_mat_n == 0;
     const bool use_tile = context.supports_subgroups && !use_subgroup_matrix && K->type == GGML_TYPE_F16 &&
                           V->type == GGML_TYPE_F16 && f16_vec4_aligned &&
                           (context.src0->ne[0] % GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH == 0) &&
                           (context.src2->ne[0] % GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH == 0) &&
                           tile_can_dispatch_all_q_rows && !use_vec;
 
-    decisions.path = use_vec                          ? GGML_WEBGPU_FLASH_ATTN_PATH_VEC :
-                     use_tile                         ? GGML_WEBGPU_FLASH_ATTN_PATH_TILE :
-                     use_subgroup_matrix              ? GGML_WEBGPU_FLASH_ATTN_PATH_SUBGROUP_MATRIX :
-                                                        GGML_WEBGPU_FLASH_ATTN_PATH_NONE;
+    decisions.path = use_vec             ? GGML_WEBGPU_FLASH_ATTN_PATH_VEC :
+                     use_tile            ? GGML_WEBGPU_FLASH_ATTN_PATH_TILE :
+                     use_subgroup_matrix ? GGML_WEBGPU_FLASH_ATTN_PATH_SUBGROUP_MATRIX :
+                                           GGML_WEBGPU_FLASH_ATTN_PATH_NONE;
 
     if (decisions.path == GGML_WEBGPU_FLASH_ATTN_PATH_NONE) {
         return decisions;
@@ -1131,9 +1134,9 @@ class ggml_webgpu_shader_lib {
                        ggml_webgpu_flash_attn_blk_pipeline_key_hash>
         flash_attn_blk_pipelines;
     std::unordered_map<ggml_webgpu_mul_mat_vec_pipeline_key, webgpu_pipeline, ggml_webgpu_mul_mat_vec_pipeline_key_hash>
-        mul_mat_vec_pipelines;     // fast mat-vec (n==1)
+        mul_mat_vec_pipelines;   // fast mat-vec (n==1)
     std::unordered_map<ggml_webgpu_mul_mat_pipeline_key, webgpu_pipeline, ggml_webgpu_mul_mat_pipeline_key_hash>
-                                             mul_mat_fast_pipelines;       // fast mat-mat (reg-tile or subgroup)
+        mul_mat_fast_pipelines;  // fast mat-mat (reg-tile or subgroup)
     std::unordered_map<ggml_webgpu_quantize_q8_pipeline_key, webgpu_pipeline, ggml_webgpu_quantize_q8_pipeline_key_hash>
                                              quantize_q8_pipelines;
     std::unordered_map<int, webgpu_pipeline> mul_mat_id_gather_pipelines;  // key is fixed
@@ -1264,10 +1267,13 @@ class ggml_webgpu_shader_lib {
     }
 
     webgpu_pipeline get_set_rows_pipeline(const ggml_webgpu_shader_lib_context & context) {
-        ggml_webgpu_set_rows_pipeline_key key = {};
-        key.dst_type                          = context.dst->type;
-        key.vec4                              = context.src0->ne[0] % 4 == 0;
-        key.i64_idx                           = context.src1->type == GGML_TYPE_I64;
+        const bool                        quantized = ggml_is_quantized(context.dst->type);
+        ggml_webgpu_set_rows_pipeline_key key       = {};
+        key.dst_type                                = context.dst->type;
+        key.vec4 =
+            (context.dst->type == GGML_TYPE_F32 || context.dst->type == GGML_TYPE_F16) && context.src0->ne[0] % 4 == 0;
+        key.i64_idx     = context.src1->type == GGML_TYPE_I64;
+        key.pair_blocks = quantized && ((context.src0->ne[0] / ggml_blck_size(context.dst->type)) % 2 == 0);
 
         auto it = set_rows_pipelines.find(key);
         if (it != set_rows_pipelines.end()) {
@@ -1286,6 +1292,14 @@ class ggml_webgpu_shader_lib {
                 defines.push_back("DST_F16");
                 variant += "_dstf16";
                 break;
+            case GGML_TYPE_Q8_0:
+                defines.push_back("DST_Q8_0");
+                variant += "_dstq8_0";
+                break;
+            case GGML_TYPE_Q4_0:
+                defines.push_back("DST_Q4_0");
+                variant += "_dstq4_0";
+                break;
             default:
                 GGML_ABORT("Unsupported dst type for set_rows shader");
         }
@@ -1298,13 +1312,19 @@ class ggml_webgpu_shader_lib {
             defines.push_back("I64_IDX");
             variant += "_i64idx";
         }
+        if (key.pair_blocks) {
+            defines.push_back("PAIR_BLOCKS");
+            variant += "_pair_blocks";
+        }
 
         defines.push_back(std::string("WG_SIZE=") + std::to_string(context.max_wg_size));
 
-        auto processed                  = preprocessor.preprocess(wgsl_set_rows, defines);
-        auto decisions                  = std::make_shared<ggml_webgpu_set_rows_shader_decisions>();
+        const auto & shader_source      = quantized ? wgsl_set_rows_quant : wgsl_set_rows;
+        auto         processed          = preprocessor.preprocess(shader_source, defines);
+        auto         decisions          = std::make_shared<ggml_webgpu_set_rows_shader_decisions>();
         decisions->vec4                 = key.vec4;
         decisions->i64_idx              = key.i64_idx;
+        decisions->pair_blocks          = key.pair_blocks;
         decisions->wg_size              = context.max_wg_size;
         set_rows_pipelines[key]         = ggml_webgpu_create_pipeline(device, processed, variant);
         set_rows_pipelines[key].context = decisions;
@@ -1660,7 +1680,7 @@ class ggml_webgpu_shader_lib {
         key.type                              = context.dst->type;
         key.d_state                           = (int) context.src0->ne[0];
         key.xbc_overlap                       = ggml_webgpu_tensor_overlap(context.src1, context.src4) &&
-                                                ggml_webgpu_tensor_overlap(context.src1, context.src5);
+                          ggml_webgpu_tensor_overlap(context.src1, context.src5);
 
         auto it = ssm_scan_pipelines.find(key);
         if (it != ssm_scan_pipelines.end()) {
@@ -1819,7 +1839,7 @@ class ggml_webgpu_shader_lib {
                           (context.src0->type == GGML_TYPE_F32 || context.src0->type == GGML_TYPE_F16)) ?
                                                        1 :
                                                        0;
-        key.use_mmvq                             =
+        key.use_mmvq =
             ggml_webgpu_can_use_mmvq(context.src0, context.src1, context.supports_dot_product, context.vendor);
 
         auto it = mul_mat_vec_pipelines.find(key);
diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
index 1846886db4e..d577b5afa3c 100644
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -1331,7 +1331,11 @@ static std::optional<webgpu_encoded_op> ggml_webgpu_set_rows(webgpu_context & ct
     }
 
     uint32_t threads;
-    if (decisions->vec4) {
+    if (ggml_is_quantized(dst->type)) {
+        const uint32_t blocks_per_row = src->ne[0] / ggml_blck_size(dst->type);
+        threads =
+            (src->ne[1] * src->ne[2] * src->ne[3]) * (decisions->pair_blocks ? (blocks_per_row / 2) : blocks_per_row);
+    } else if (decisions->vec4) {
         threads = (src->ne[1] * src->ne[2] * src->ne[3]) * (src->ne[0] / 4);
     } else {
         threads = src->ne[0] * src->ne[1] * src->ne[2] * src->ne[3];
@@ -3720,7 +3724,7 @@ static void ggml_webgpu_init_memset_pipeline(webgpu_global_context & ctx) {
     ctx->memset_pipeline = ggml_webgpu_create_pipeline(ctx->device, wgsl_memset, "memset", constants);
 }
 
-static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
+static void create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
     wgpu::RequestAdapterOptions options = {};
 
 #ifndef __EMSCRIPTEN__
@@ -3758,10 +3762,6 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
     ctx->webgpu_global_ctx->command_submit_batch_size = ggml_backend_webgpu_get_command_submit_batch_size();
     ctx->webgpu_global_ctx->max_inflight_batches      = ggml_backend_webgpu_get_max_inflight_batches();
     ctx->webgpu_global_ctx->vendor                    = info.vendor;
-    wgpu::SupportedFeatures features;
-    ctx->webgpu_global_ctx->adapter.GetFeatures(&features);
-    // we require f16 support
-    GGML_ASSERT(ctx->webgpu_global_ctx->adapter.HasFeature(wgpu::FeatureName::ShaderF16));
     ctx->webgpu_global_ctx->capabilities.supports_subgroups =
         ctx->webgpu_global_ctx->adapter.HasFeature(wgpu::FeatureName::Subgroups);
     // for dot4I8packed
@@ -3873,7 +3873,6 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
         "device_desc: %s\n",
         info.vendorID, std::string(info.vendor).c_str(), std::string(info.architecture).c_str(), info.deviceID,
         std::string(info.device).c_str(), std::string(info.description).c_str());
-    return true;
 }
 
 static webgpu_context initialize_webgpu_context(ggml_backend_dev_t dev) {
@@ -4046,8 +4045,9 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
                           (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_I32);
             break;
         case GGML_OP_SET_ROWS:
-            supports_op = ((op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_F32) && src0->type == GGML_TYPE_F32 &&
-                           (src1->type == GGML_TYPE_I64 || src1->type == GGML_TYPE_I32));
+            supports_op = ((op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_Q8_0 ||
+                            op->type == GGML_TYPE_Q4_0) &&
+                           src0->type == GGML_TYPE_F32 && (src1->type == GGML_TYPE_I64 || src1->type == GGML_TYPE_I32));
             break;
         case GGML_OP_GET_ROWS:
             if (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_webgpu_supported_qtype(src0->type)) {
@@ -4502,7 +4502,12 @@ ggml_backend_reg_t ggml_backend_webgpu_reg() {
             UINT64_MAX);
     }
 
-    if (adapter != nullptr) {
+    // WebGPU backend requires f16 support and, on native, implicit device synchronization.
+    if (adapter != nullptr && adapter.HasFeature(wgpu::FeatureName::ShaderF16)
+#ifndef __EMSCRIPTEN__
+        && adapter.HasFeature(wgpu::FeatureName::ImplicitDeviceSynchronization)
+#endif
+    ) {
         ctx->device_count = 1;
     }
 
@@ -4510,8 +4515,11 @@ ggml_backend_reg_t ggml_backend_webgpu_reg() {
 }
 
 ggml_backend_t ggml_backend_webgpu_init(void) {
-    ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_webgpu_reg(), 0);
-
+    ggml_backend_reg_t reg = ggml_backend_webgpu_reg();
+    if (ggml_backend_reg_dev_count(reg) == 0) {  // nothing registered (e.g. no adapter, or it lacks a required feature)
+        return nullptr;
+    }
+    ggml_backend_dev_t dev = ggml_backend_reg_dev_get(reg, 0);
     return ggml_backend_webgpu_backend_init(dev, nullptr);
 }
 
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl
index 99e9192c71a..09f2f0eddb3 100644
--- a/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl
@@ -71,7 +71,6 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
         return;
     }
 
-    // getting the row from gid
     let elems_per_row = params.ne0 / VEC_SIZE;
     var i = gid.x / elems_per_row;
 
@@ -104,6 +103,6 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
     let i_dst_row = params.offset_dst + idx_val * params.stride_dst1 + i_src2 * params.stride_dst2 + i_src3 * params.stride_dst3;
     let i_src_row = params.offset_src + i_src1 * params.stride_src1 + i_src2 * params.stride_src2 + i_src3 * params.stride_src3;
 
-    let col_idx = (gid.x % elems_per_row);
-    dst[i_dst_row/VEC_SIZE + col_idx] = DST_TYPE(src[i_src_row/VEC_SIZE + col_idx]);
+    let col_idx = gid.x % elems_per_row;
+    dst[i_dst_row / VEC_SIZE + col_idx] = DST_TYPE(src[i_src_row / VEC_SIZE + col_idx]);
 }
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/set_rows_quant.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/set_rows_quant.wgsl
new file mode 100644
index 00000000000..876e65b6ae1
--- /dev/null
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/set_rows_quant.wgsl
@@ -0,0 +1,224 @@
+#ifdef DST_Q8_0
+#define BLOCK_SIZE 32u
+#define BLOCK_BYTES 34u
+#define QS_WORDS 8u
+#elif defined(DST_Q4_0)
+#define BLOCK_SIZE 32u
+#define BLOCK_BYTES 18u
+#define QS_WORDS 4u
+#endif
+
+@group(0) @binding(0)
+var<storage, read_write> src: array<f32>; // f32 source values, indexed in elements
+
+@group(0) @binding(1)
+var<storage, read_write> idx: array<u32>; // destination row indices; with I64_IDX each index spans two consecutive u32 words
+
+@group(0) @binding(2)
+#ifdef PAIR_BLOCKS
+var<storage, read_write> dst: array<u32>; // two blocks per invocation: written with plain stores
+#else
+var<storage, read_write> dst: array<atomic<u32>>; // one block per invocation: written with atomic ops
+#endif
+
+#ifdef I64_IDX
+@group(0) @binding(3)
+var<storage, read_write> error: atomic<u32>; // set to 1 by main() when the second word of a 64-bit index is nonzero
+#define PARAMS_BINDING 4
+#else
+#define PARAMS_BINDING 3
+#endif
+
+struct Params { // uniform inputs of the quantizing set_rows kernel
+    offset_src: u32, // in elements
+    offset_idx: u32, // in elements
+    offset_dst: u32, // in blocks
+
+    // Strides: src and idx strides are in elements, dst strides are in blocks
+    stride_src1: u32,
+    stride_src2: u32,
+    stride_src3: u32,
+
+    stride_idx0: u32,
+    stride_idx1: u32,
+    stride_idx2: u32,
+
+    stride_dst1: u32,
+    stride_dst2: u32,
+    stride_dst3: u32,
+
+    // Shape of src
+    ne0: u32, // row length in elements; main() divides it by the block size to get blocks per row
+    n_rows: u32, // extent of src dim 1 (rows per dim-2 slice)
+    ne2: u32,
+    ne3: u32,
+
+    // Shape of idx
+    idx1: u32, // main() wraps i_src2 modulo this extent
+    idx2: u32, // main() wraps i_src3 modulo this extent
+};
+
+@group(0) @binding(PARAMS_BINDING)
+var<uniform> params: Params;
+
+// Overwrite only the `mask` bits of dst[word_idx] with `bits`. Without PAIR_BLOCKS (odd block count per row, one block per invocation) blocks are not 4-byte aligned and a word may be shared with another invocation, so the merge is an atomic CAS retry loop.
+#ifndef PAIR_BLOCKS
+fn merge_store_dst_word(word_idx: u32, mask: u32, bits: u32) {
+    let incoming = bits & mask;
+    loop {
+        let seen = atomicLoad(&dst[word_idx]);
+        let swap = atomicCompareExchangeWeak(&dst[word_idx], seen, (seen & ~mask) | incoming);
+        if (swap.exchanged) {
+            return;
+        }
+    }
+}
+#else
+fn merge_store_dst_word(word_idx: u32, mask: u32, bits: u32) {
+    let kept = dst[word_idx] & ~mask;
+    dst[word_idx] = kept | (bits & mask);
+}
+#endif
+
+// Store the low 16 bits of `value` at byte block_byte_offset + byte_offset past word dst_word_idx (offset treated as 2-byte aligned).
+fn store_u16(dst_word_idx: u32, block_byte_offset: u32, byte_offset: u32, value: u32) {
+    let byte_pos = block_byte_offset + byte_offset;
+    let bit_shift = (byte_pos & 2u) * 8u; // 0 selects the low half of the word, 16 the high half
+    let half_mask = 0xFFFFu << bit_shift;
+    merge_store_dst_word(dst_word_idx + byte_pos / 4u, half_mask, (value & 0xFFFFu) << bit_shift);
+}
+
+fn store_u32(dst_word_idx: u32, block_byte_offset: u32, byte_offset: u32, value: u32) {
+    let byte_pos = block_byte_offset + byte_offset;
+    let first_word = dst_word_idx + byte_pos / 4u;
+    let bit_shift = (byte_pos & 3u) * 8u;
+    // Word-aligned destination: the value fills exactly one word, so overwrite it outright.
+    if (bit_shift == 0u) {
+#ifdef PAIR_BLOCKS
+        dst[first_word] = value;
+#else
+        atomicStore(&dst[first_word], value);
+#endif
+        return;
+    }
+    // Otherwise the value straddles two words: its low part fills the top of the first word, its high part the bottom of the next.
+    let upper_bits_mask = 0xFFFFFFFFu << bit_shift;
+    let lower_bits_mask = (1u << bit_shift) - 1u;
+    merge_store_dst_word(first_word, upper_bits_mask, value << bit_shift);
+    merge_store_dst_word(first_word + 1u, lower_bits_mask, value >> (32u - bit_shift));
+}
+
+fn quantize_block_params(src_block: u32) -> vec2<f32> {
+#ifdef DST_Q8_0
+    var abs_max = 0.0;
+    for (var e: u32 = 0u; e < BLOCK_SIZE; e++) {
+        abs_max = max(abs_max, abs(src[src_block + e]));
+    }
+    // Returns (scale, inverse scale): the largest magnitude maps to 127; the inverse is 0 unless the scale is positive.
+    let scale = abs_max / 127.0;
+    let inv_scale = select(0.0, 1.0 / scale, scale > 0.0);
+    return vec2(scale, inv_scale);
+#elif defined(DST_Q4_0)
+    var abs_max = 0.0;
+    var signed_max = 0.0;
+    for (var e: u32 = 0u; e < BLOCK_SIZE; e++) {
+        let elem = src[src_block + e];
+        let magnitude = abs(elem);
+        if (magnitude > abs_max) {
+            abs_max = magnitude;
+            signed_max = elem;
+        }
+    }
+    // Returns (scale, inverse scale): the largest-magnitude element (sign kept) maps to -8; the inverse is 0 for a zero scale.
+    let scale = signed_max / -8.0;
+    let inv_scale = select(0.0, 1.0 / scale, scale != 0.0);
+    return vec2(scale, inv_scale);
+#endif
+}
+
+fn quantize_block_word(src_block: u32, j: u32, id: f32) -> u32 { // builds the j-th packed 32-bit word of the block's quantized payload
+#ifdef DST_Q8_0
+    var word = 0u;
+    for (var lane: u32 = 0u; lane < 4u; lane++) {
+        word |= u32(i32(round(src[src_block + j * 4u + lane] * id)) & 0xFF) << (8u * lane);
+    }
+    return word;
+#elif defined(DST_Q4_0)
+    var word = 0u;
+    for (var lane: u32 = 0u; lane < 4u; lane++) {
+        // Byte `lane`: low nibble comes from the first 16 elements of the block, high nibble from the last 16.
+        let first_half = src[src_block + j * 4u + lane] * id;
+        let second_half = src[src_block + 16u + j * 4u + lane] * id;
+        let lo_nibble = u32(clamp(i32(first_half + 8.5), 0, 15)) & 0xFu;
+        let hi_nibble = u32(clamp(i32(second_half + 8.5), 0, 15)) & 0xFu;
+        word |= (lo_nibble << (8u * lane)) | (hi_nibble << (8u * lane + 4u));
+    }
+    return word;
+#endif
+}
+
+fn quantize_block(src_block: u32, dst_word_idx: u32, block_byte_offset: u32) {
+    // .x is the block scale, .y the inverse scale applied to each element while quantizing.
+    let scale_pair = quantize_block_params(src_block);
+    // Written layout: 16-bit float scale at block bytes 0..1, then QS_WORDS packed 32-bit words from byte 2 onwards.
+    store_u16(dst_word_idx, block_byte_offset, 0u, pack2x16float(vec2(scale_pair.x, 0.0)) & 0xFFFFu);
+    var qs_byte = 2u;
+    for (var w: u32 = 0u; w < QS_WORDS; w++) {
+        store_u32(dst_word_idx, block_byte_offset, qs_byte, quantize_block_word(src_block, w, scale_pair.y));
+        qs_byte += 4u;
+    }
+}
+
+@compute @workgroup_size(WG_SIZE)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) { // each invocation quantizes one block (two with PAIR_BLOCKS) of one source row
+    let blocks_per_row = params.ne0 / BLOCK_SIZE;
+#ifdef PAIR_BLOCKS
+    let blocks_per_invocation = 2u;
+#else
+    let blocks_per_invocation = 1u;
+#endif
+    let invocations_per_row = blocks_per_row / blocks_per_invocation;
+    let total_invocations = params.ne3 * params.ne2 * params.n_rows * invocations_per_row;
+    if (gid.x >= total_invocations) {
+        return;
+    }
+    // Split the flat invocation id into a flat source-row index (i) and the first block this invocation handles in that row.
+    var i = gid.x / invocations_per_row;
+    let block_in_row = (gid.x % invocations_per_row) * blocks_per_invocation;
+    // Decompose the flat row index into (i_src3, i_src2, i_src1).
+    let i_src3 = i / (params.ne2 * params.n_rows);
+    i = i % (params.ne2 * params.n_rows);
+    let i_src2 = i / params.n_rows;
+    let i_src1 = i % params.n_rows;
+    // idx is addressed by the row along dim 0; the two outer source indices wrap around the idx extents.
+    let i_idx2 = i_src3 % params.idx2;
+    let i_idx1 = i_src2 % params.idx1;
+    let i_idx0 = i_src1;
+
+#ifdef I64_IDX
+    let idx_high = (params.offset_idx + i_idx0 * params.stride_idx0 + i_idx1 * params.stride_idx1 + i_idx2 * params.stride_idx2) * 2u;
+    let idx_val = idx[idx_high];
+    let idx_low_val = idx[idx_high + 1u];
+    // The first word is used as the row index and the second must be zero. NOTE(review): names read as swapped if indices are little-endian (second word = upper 32 bits) - confirm.
+    if (idx_low_val != 0u) {
+        atomicStore(&error, 1u);
+        return;
+    }
+#else
+    let idx_i = params.offset_idx + i_idx0 * params.stride_idx0 + i_idx1 * params.stride_idx1 + i_idx2 * params.stride_idx2;
+    let idx_val = idx[idx_i];
+#endif
+    // Destination row start in blocks (row chosen by idx_val) and source row start in elements; then this invocation's block.
+    let dst_row_blocks = params.offset_dst + idx_val * params.stride_dst1 + i_src2 * params.stride_dst2 + i_src3 * params.stride_dst3;
+    let src_row = params.offset_src + i_src1 * params.stride_src1 + i_src2 * params.stride_src2 + i_src3 * params.stride_src3;
+    let src_block = src_row + block_in_row * BLOCK_SIZE;
+    let dst_block_byte = (dst_row_blocks + block_in_row) * BLOCK_BYTES;
+    // NOTE(review): the PAIR_BLOCKS path passes byte offset 0, i.e. it assumes dst_block_byte is 4-byte aligned - confirm offset_dst and the dst strides guarantee this.
+    let dst_word_idx = dst_block_byte / 4u;
+#ifdef PAIR_BLOCKS
+    quantize_block(src_block, dst_word_idx, 0u);
+    quantize_block(src_block + BLOCK_SIZE, dst_word_idx, BLOCK_BYTES);
+#else
+    quantize_block(src_block, dst_word_idx, dst_block_byte & 3u); // block may start mid-word: pass its byte offset within the word
+#endif
+}
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 476c3079795..8815c67d8bc 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -5223,7 +5223,7 @@ static struct ggml_tensor * ggml_fill_impl(
     struct ggml_tensor  * a,
     float                 c,
     bool                  inplace) {
-    GGML_ASSERT(a->type == GGML_TYPE_F32);
+    GGML_ASSERT(a->type == GGML_TYPE_F32 || a->type == GGML_TYPE_F16);
     GGML_ASSERT(ggml_is_contiguous(a));
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 0189f6f03c5..5a567e2d159 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -451,6 +451,7 @@ class MODEL_ARCH(IntEnum):
     DEEPSEEK         = auto()
     DEEPSEEK2        = auto()
     DEEPSEEK2OCR     = auto()
+    DEEPSEEK32       = auto()
     CHATGLM          = auto()
     GLM4             = auto()
     GLM4_MOE         = auto()
@@ -811,6 +812,8 @@ class MODEL_TENSOR(IntEnum):
     V_SAM_NET_3          = auto() # Deepseek-OCR
     V_ENC_EMBD_IMGNL     = auto() # Deepseek-OCR
     V_ENC_EMBD_VSEP      = auto() # Deepseek-OCR
+    V_RESMPL_QUERY_768   = auto() # Deepseek-OCR-2
+    V_RESMPL_QUERY_1024  = auto() # Deepseek-OCR-2
 
     # audio (mtmd)
     A_ENC_EMBD_POS        = auto()
@@ -967,6 +970,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.DEEPSEEK:         "deepseek",
     MODEL_ARCH.DEEPSEEK2:        "deepseek2",
     MODEL_ARCH.DEEPSEEK2OCR:     "deepseek2-ocr",
+    MODEL_ARCH.DEEPSEEK32:       "deepseek32",
     MODEL_ARCH.CHATGLM:          "chatglm",
     MODEL_ARCH.GLM4:             "glm4",
     MODEL_ARCH.GLM4_MOE:         "glm4moe",
@@ -1327,6 +1331,8 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.V_SAM_NET_3:               "v.sam.net_3",
     MODEL_TENSOR.V_ENC_EMBD_IMGNL:          "v.image_newline", # Deepseek-OCR
     MODEL_TENSOR.V_ENC_EMBD_VSEP:           "v.view_seperator", # Deepseek-OCR
+    MODEL_TENSOR.V_RESMPL_QUERY_768:        "v.resample_query_768", # Deepseek-OCR-2 qwen2
+    MODEL_TENSOR.V_RESMPL_QUERY_1024:       "v.resample_query_1024", # Deepseek-OCR-2 qwen2
     # audio (mtmd)
     # note: all audio tensor names must use prefix "a." or "mm.a."
     MODEL_TENSOR.A_ENC_EMBD_POS:            "a.position_embd",
@@ -1505,6 +1511,8 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.V_SAM_NECK,
         MODEL_TENSOR.V_SAM_NET_2,
         MODEL_TENSOR.V_SAM_NET_3,
+        MODEL_TENSOR.V_RESMPL_QUERY_768,
+        MODEL_TENSOR.V_RESMPL_QUERY_1024,
         # audio
         MODEL_TENSOR.A_ENC_EMBD_POS,
         MODEL_TENSOR.A_ENC_EMBD_NORM,
@@ -2930,6 +2938,46 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_UP_SHEXP,
         MODEL_TENSOR.FFN_EXP_PROBS_B,
     ],
+    MODEL_ARCH.DEEPSEEK32: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_Q_A,
+        MODEL_TENSOR.ATTN_Q_B,
+        MODEL_TENSOR.ATTN_KV_A_MQA,
+        MODEL_TENSOR.ATTN_K_B,
+        MODEL_TENSOR.ATTN_V_B,
+        MODEL_TENSOR.ATTN_Q_A_NORM,
+        MODEL_TENSOR.ATTN_KV_A_NORM,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+        MODEL_TENSOR.FFN_GATE_SHEXP,
+        MODEL_TENSOR.FFN_DOWN_SHEXP,
+        MODEL_TENSOR.FFN_UP_SHEXP,
+        MODEL_TENSOR.FFN_EXP_PROBS_B,
+        MODEL_TENSOR.INDEXER_K_NORM,
+        MODEL_TENSOR.INDEXER_PROJ,
+        MODEL_TENSOR.INDEXER_ATTN_K,
+        MODEL_TENSOR.INDEXER_ATTN_Q_B,
+        # NextN/MTP tensors - preserved but unused
+        MODEL_TENSOR.NEXTN_EH_PROJ,
+        MODEL_TENSOR.NEXTN_EMBED_TOKENS,
+        MODEL_TENSOR.NEXTN_ENORM,
+        MODEL_TENSOR.NEXTN_HNORM,
+        MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD,
+        MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM,
+    ],
     MODEL_ARCH.ERNIE4_5_MOE: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
@@ -4077,6 +4125,10 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.ROPE_FREQS,
         MODEL_TENSOR.ATTN_ROT_EMBD,
     ],
+    MODEL_ARCH.DEEPSEEK32: [
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+    ],
     MODEL_ARCH.CHATGLM: [
         MODEL_TENSOR.ROPE_FREQS,
     ],
@@ -4283,6 +4335,7 @@ class VisionProjectorType:
     JANUS_PRO = "janus_pro"
     DOTSOCR = "dots_ocr"
     DEEPSEEKOCR = "deepseekocr"
+    DEEPSEEKOCR2 = "deepseekocr2"
     LFM2A = "lfm2a" # audio
     MUSIC_FLAMINGO = "musicflamingo" # audio
     GLM4V = "glm4v"
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index ecc3c05f99a..444f0f2855a 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -1485,6 +1485,7 @@ class TensorNameMap:
             "siglip2.vision_model.encoder.layers.{bid}.self_attn.q_proj", # youtuvl
             "model.vision_model.transformer.layers.{bid}.self_attn.q_proj", # Deepseek-OCR CLIP, generated
             "vision_model.model.layers.{bid}.self_attn.q_proj.linear", # gemma4
+            "model.qwen2_model.model.model.layers.{bid}.self_attn.q_proj" # Deepseek-OCR-2 qwen2
         ),
 
         MODEL_TENSOR.V_ENC_ATTN_Q_NORM: (
@@ -1509,6 +1510,7 @@ class TensorNameMap:
             "model.vision_model.transformer.layers.{bid}.self_attn.k_proj", # Deepseek-OCR CLIP, generated
             "siglip2.vision_model.encoder.layers.{bid}.self_attn.k_proj",
             "vision_model.model.layers.{bid}.self_attn.k_proj.linear", # gemma4
+            "model.qwen2_model.model.model.layers.{bid}.self_attn.k_proj" # Deepseek-OCR-2 qwen2
         ),
 
         MODEL_TENSOR.V_ENC_ATTN_K_NORM: (
@@ -1533,6 +1535,7 @@ class TensorNameMap:
             "siglip2.vision_model.encoder.layers.{bid}.self_attn.v_proj",
             "model.vision_model.transformer.layers.{bid}.self_attn.v_proj", # Deepseek-OCR CLIP, generated
             "vision_model.model.layers.{bid}.self_attn.v_proj.linear", # gemma4
+            "model.qwen2_model.model.model.layers.{bid}.self_attn.v_proj" # Deepseek-OCR-2 qwen2
         ),
 
         MODEL_TENSOR.V_ENC_INPUT_NORM: (
@@ -1554,6 +1557,7 @@ class TensorNameMap:
             "vision_model.radio_model.model.blocks.{bid}.norm1", # Nemotron Nano v2 VL
             "vision_tower.blocks.{bid}.norm1", # dots.ocr
             "vision_model.transformer.resblocks.{bid}.ln_1", # Step3-VL
+            "model.qwen2_model.model.model.layers.{bid}.input_layernorm", # Deepseek-OCR-2 qwen2
         ),
 
         MODEL_TENSOR.V_ENC_ATTN_O: (
@@ -1574,6 +1578,7 @@ class TensorNameMap:
             "model.vision_model.transformer.layers.{bid}.self_attn.out_proj", # Deepseek-OCR CLIP
             "siglip2.vision_model.encoder.layers.{bid}.self_attn.out_proj", # youtuvl
             "vision_model.radio_model.model.blocks.{bid}.attn.proj", # Nemotron Nano v2 VL
+            "model.qwen2_model.model.model.layers.{bid}.self_attn.o_proj", # Deepseek-OCR-2 qwen2
             "vision_model.model.layers.{bid}.self_attn.o_proj.linear", # gemma4
             "vision_tower.blocks.{bid}.attn.proj", # dots.ocr
             "vision_model.transformer.resblocks.{bid}.attn.out_proj", # Step3-VL
@@ -1603,6 +1608,7 @@ class TensorNameMap:
             "vision_model.model.layers.{bid}.pre_feedforward_layernorm", # gemma4
             "vision_tower.blocks.{bid}.norm2", # dots.ocr
             "vision_model.transformer.resblocks.{bid}.ln_2", # Step3-VL
+            "model.qwen2_model.model.model.layers.{bid}.post_attention_layernorm", # Deepseek-OCR-2 qwen2
         ),
 
         MODEL_TENSOR.V_ENC_FFN_UP: (
@@ -1625,6 +1631,7 @@ class TensorNameMap:
             "vision_model.radio_model.model.blocks.{bid}.mlp.fc1", # Nemotron Nano v2 VL
             "vision_model.model.layers.{bid}.mlp.up_proj", # gemma4
             "vision_model.transformer.resblocks.{bid}.mlp.c_fc", # Step3-VL
+            "model.qwen2_model.model.model.layers.{bid}.mlp.up_proj", # Deepseek-OCR-2 qwen2
         ),
 
         MODEL_TENSOR.V_ENC_FFN_GATE: (
@@ -1632,6 +1639,7 @@ class TensorNameMap:
             "vision_encoder.transformer.layers.{bid}.feed_forward.w1", # pixtral
             "visual.blocks.{bid}.mlp.gate_proj", # qwen2.5vl
             "vision_model.model.layers.{bid}.mlp.gate_proj", # gemma4
+            "model.qwen2_model.model.model.layers.{bid}.mlp.gate_proj", # Deepseek-OCR-2 qwen2
         ),
 
         MODEL_TENSOR.V_ENC_FFN_DOWN: (
@@ -1652,6 +1660,7 @@ class TensorNameMap:
             "model.vision_model.transformer.layers.{bid}.mlp.fc2", # Deepseek-OCR CLIP
             "siglip2.vision_model.encoder.layers.{bid}.mlp.fc2",
             "vision_model.radio_model.model.blocks.{bid}.mlp.fc2", # Nemotron Nano v2 VL
+            "model.qwen2_model.model.model.layers.{bid}.mlp.down_proj" , # Deepseek-OCR-2 qwen2
             "vision_model.model.layers.{bid}.mlp.down_proj", # gemma4
             "vision_model.transformer.resblocks.{bid}.mlp.c_proj", # Step3-VL
         ),
@@ -1699,6 +1708,7 @@ class TensorNameMap:
             "vision_tower.encoder.final_layernorm", # kimi-vl
             "visual.post_layernorm", # glm4v
             "siglip2.vision_model.post_layernorm",
+            "model.qwen2_model.model.model.norm", # Deepseek-OCR-2 qwen2
         ),
 
         MODEL_TENSOR.V_MM_POST_NORM: (
@@ -1879,6 +1889,14 @@ class TensorNameMap:
             "model.sam_model.net_3",
         ),
 
+        MODEL_TENSOR.V_RESMPL_QUERY_768: (
+            "model.qwen2_model.query_768", # Deepseek-OCR-2 qwen2
+        ),
+
+        MODEL_TENSOR.V_RESMPL_QUERY_1024: (
+            "model.qwen2_model.query_1024", # Deepseek-OCR-2 qwen2
+        ),
+
         MODEL_TENSOR.V_MM_POST_FC_NORM: (
             "model.vision.linear_proj.norm1", # cogvlm
         ),
diff --git a/requirements/requirements-server-bench.txt b/requirements/requirements-server-bench.txt
index ea5849fa104..fb3b0d2664b 100644
--- a/requirements/requirements-server-bench.txt
+++ b/requirements/requirements-server-bench.txt
@@ -1,4 +1,4 @@
-datasets~=3.2.0
+datasets~=4.8.0
 matplotlib~=3.10.0
 numpy~=1.26.4
 requests~=2.32.3
diff --git a/scripts/server-bench.py b/scripts/server-bench.py
index 1b557a495a5..2eabb3bce85 100755
--- a/scripts/server-bench.py
+++ b/scripts/server-bench.py
@@ -25,7 +25,7 @@ def get_prompts_text(dataset_name: str, n_prompts: int) -> Optional[list[str]]:
     ret = []
     if dataset_name.lower() == "mmlu":
         logger.info("Loading MMLU dataset...")
-        ret = datasets.load_dataset("cais/mmlu", "all")["test"]["question"]  # type: ignore
+        ret = datasets.load_dataset("cais/mmlu", "all")["test"]["question"]
     else:
         return None
     if n_prompts >= 0:
diff --git a/scripts/snapdragon/ggml-hexagon-profile.py b/scripts/snapdragon/ggml-hexagon-profile.py
index 3edaacd2749..aa1f20dcc23 100755
--- a/scripts/snapdragon/ggml-hexagon-profile.py
+++ b/scripts/snapdragon/ggml-hexagon-profile.py
@@ -24,7 +24,7 @@
 }
 
 op_pattern = re.compile(
-    r"profile-op\s+(?P<op_name>[A-Z_0-9]+):\s+.*?\s+:\s+(?P<dims>[\d:x\s\->!]+)\s+:\s+(?P<types>[a-z\d_\s\->x]+)\s+:\s+.*?\s+usec\s+(?P<usec>\d+)\s+cycles\s+(?P<cycles>\d+)(?:\s+pmu\s+\[(?P<pmu>[\d,\s]+)\])?"
+    r"profile-op\s+(?P<op_name>[A-Z_0-9+]+):\s+.*?\s+:\s+(?P<dims>[\d:x\s\->!]+)\s+:\s+(?P<types>[a-z\d_\s\->x]+)\s+:\s+.*?\s+usec\s+(?P<usec>\d+)\s+cycles\s+(?P<cycles>\d+)(?:\s+pmu\s+\[(?P<pmu>[\d,\s]+)\])?"
 )
 
 logger = logging.getLogger("ggml-hexagon-profile")
diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index a4f87b2b9ae..538ef80bc7a 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-e705c5fed490514458bdd2eaddc43bd098fcce9b
+1e33fed33e87c43aa4c4078e2a9c239d4c1f1bd3
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 7b1fcfca0ad..d15ccfd99f1 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -24,6 +24,7 @@ add_library(llama
             llama-io.cpp
             llama-kv-cache.cpp
             llama-kv-cache-iswa.cpp
+            llama-kv-cache-dsa.cpp
             llama-memory.cpp
             llama-memory-hybrid.cpp
             llama-memory-hybrid-iswa.cpp
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index e95ba6daac1..b485ac02e75 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -75,6 +75,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_DEEPSEEK,         "deepseek"         },
     { LLM_ARCH_DEEPSEEK2,        "deepseek2"        },
     { LLM_ARCH_DEEPSEEK2OCR,     "deepseek2-ocr"    },
+    { LLM_ARCH_DEEPSEEK32,       "deepseek32"       },
     { LLM_ARCH_CHATGLM,          "chatglm"          },
     { LLM_ARCH_GLM4,             "glm4"             },
     { LLM_ARCH_GLM4_MOE,         "glm4moe"          },
@@ -904,6 +905,7 @@ bool llm_arch_supports_sm_tensor(const llm_arch & arch) {
         case LLM_ARCH_OLMO2:
         case LLM_ARCH_OLMOE:
         case LLM_ARCH_DEEPSEEK2:
+        case LLM_ARCH_DEEPSEEK32:
         case LLM_ARCH_GLM_DSA:
         case LLM_ARCH_BITNET:
         case LLM_ARCH_T5:
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 7c1dcc4d6c2..b59043e408f 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -79,6 +79,7 @@ enum llm_arch {
     LLM_ARCH_DEEPSEEK,
     LLM_ARCH_DEEPSEEK2,
     LLM_ARCH_DEEPSEEK2OCR,
+    LLM_ARCH_DEEPSEEK32,
     LLM_ARCH_CHATGLM,
     LLM_ARCH_GLM4,
     LLM_ARCH_GLM4_MOE,
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index fc027de8b39..e6ec3054daf 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -7,6 +7,7 @@
 
 #include "llama-kv-cache.h"
 #include "llama-kv-cache-iswa.h"
+#include "llama-kv-cache-dsa.h"
 #include "llama-memory-hybrid.h"
 #include "llama-memory-hybrid-iswa.h"
 #include "llama-memory-recurrent.h"
@@ -29,7 +30,10 @@ static ggml_tensor * build_attn_inp_kq_mask(
     const auto n_tokens = ubatch.n_tokens;
     const auto n_stream = cparams.kv_unified ? 1 : ubatch.n_seqs_unq;
 
-    ggml_tensor * res = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
+    // flash attention requires an f16 mask
+    const auto type = cparams.flash_attn ? GGML_TYPE_F16 : GGML_TYPE_F32;
+
+    ggml_tensor * res = ggml_new_tensor_4d(ctx, type, n_kv, n_tokens/n_stream, 1, n_stream);
     ggml_set_input(res);
     ggml_set_name(res, "attn_inp_kq_mask");
 
@@ -102,6 +106,39 @@ bool llm_graph_input_embd::can_reuse(const llm_graph_params & params) {
     return res;
 }
 
+void llm_graph_input_embd_h::set_input(const llama_ubatch * ubatch) {
+    const int64_t n_tokens = ubatch->n_tokens;
+    // upload either the token ids or the raw embeddings of this ubatch, then the hidden state (if any)
+    if (ubatch->token) {
+        ggml_backend_tensor_set(tokens, ubatch->token, 0, n_tokens*ggml_element_size(tokens));
+    } else {
+        // note: mtmd embedding input goes through here
+        GGML_ASSERT(ubatch->embd);
+        GGML_ASSERT(n_embd == embd->ne[0]);
+        // size the copy by the destination tensor (embd), not by the separate hidden-state tensor h
+        ggml_backend_tensor_set(embd, ubatch->embd, 0, n_tokens*n_embd*ggml_element_size(embd));
+    }
+
+    // TODO: extend llama_ubatch to differentiate between token embeddings and hidden states
+    //       for now, we assume that the hidden state is always provided as an embedding
+    //       ref: https://github.com/ggml-org/llama.cpp/pull/23643
+    if (ubatch->embd) {
+        GGML_ASSERT(n_embd == h->ne[0]);
+        // the same ubatch->embd buffer is used as the hidden-state input (see TODO above)
+        ggml_backend_tensor_set(h, ubatch->embd, 0, n_tokens*n_embd*ggml_element_size(h));
+    }
+}
+
+bool llm_graph_input_embd_h::can_reuse(const llm_graph_params & params) {
+    const auto & ub = params.ubatch;
+    // an input kind that is absent from the new ubatch imposes no constraint on its tensor
+    const bool tokens_ok = !ub.token || (tokens && tokens->ne[0] == ub.n_tokens);
+    const bool embd_ok   = !ub.embd  || (embd   && embd->ne[1]   == ub.n_tokens);
+    const bool h_ok      = !ub.embd  || (h      && h->ne[1]      == ub.n_tokens);
+
+    return tokens_ok && embd_ok && h_ok;
+}
+
 void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
     if (ubatch->pos && pos) {
         const int64_t n_tokens = ubatch->n_tokens;
@@ -348,7 +385,8 @@ void llm_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) {
     }
 }
 
-static void print_mask(const float * data, int64_t n_tokens, int64_t n_kv, int64_t n_swa, llama_swa_type swa_type) {
+template <typename T>
+static void print_mask(const T * data, int64_t n_tokens, int64_t n_kv, int64_t n_swa, llama_swa_type swa_type) {
     LLAMA_LOG_DEBUG("%s: === Attention mask ===\n", __func__);
     const char * swa_type_str = "unknown";
 
@@ -372,7 +410,7 @@ static void print_mask(const float * data, int64_t n_tokens, int64_t n_kv, int64
     for (int i = 0; i < std::min((int64_t)20, n_tokens); ++i) {
         LLAMA_LOG_DEBUG(" %2d ", i);
         for (int j = 0; j < std::min((int64_t)20, n_kv); ++j) {
-            float val = data[i * n_kv + j];
+            float val = llama_cast<float>(data[i * n_kv + j]);
             if (val == -INFINITY) {
                 LLAMA_LOG_DEBUG(" ∞");
             } else {
@@ -387,7 +425,10 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
     const int64_t n_kv     = ubatch->n_tokens;
     const int64_t n_tokens = ubatch->n_tokens;
 
-    const auto fill_mask = [&](float * data, int n_swa, llama_swa_type swa_type) {
+    const auto fill_mask = [&](auto * data, int64_t ne, int n_swa, llama_swa_type swa_type) {
+        using T = std::remove_reference_t<decltype(*data)>;
+        std::fill(data, data + ne, llama_cast<T>(-INFINITY));
+
         for (int i1 = 0; i1 < n_tokens; ++i1) {
             const llama_seq_id s1 = ubatch->seq_id[i1][0];
             const llama_pos    p1 = ubatch->pos[i1];
@@ -413,38 +454,30 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
                     continue;
                 }
 
-                data[idst + i0] = hparams.use_alibi ? -std::abs(p0 - p1) : 0.0f;
+                data[idst + i0] = llama_cast<T>(hparams.use_alibi ? -std::abs(p0 - p1) : 0.0f);
             }
         }
-    };
-
-    {
-        GGML_ASSERT(self_kq_mask);
-        GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask->buffer));
-
-        float * data = (float *) self_kq_mask->data;
-
-        std::fill(data, data + ggml_nelements(self_kq_mask), -INFINITY);
-
-        fill_mask(data, 0, LLAMA_SWA_TYPE_NONE);
 
         if (debug) {
-            print_mask(data, n_tokens, n_kv, 0, LLAMA_SWA_TYPE_NONE);
+            print_mask(data, n_tokens, n_kv, n_swa, swa_type);
         }
+    };
+
+    GGML_ASSERT(self_kq_mask);
+    GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask->buffer));
+    if (self_kq_mask->type == GGML_TYPE_F16) {
+        fill_mask((ggml_fp16_t *) self_kq_mask->data, ggml_nelements(self_kq_mask), 0, LLAMA_SWA_TYPE_NONE);
+    } else {
+        fill_mask((float       *) self_kq_mask->data, ggml_nelements(self_kq_mask), 0, LLAMA_SWA_TYPE_NONE);
     }
 
     if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
         GGML_ASSERT(self_kq_mask_swa);
         GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask_swa->buffer));
-
-        float * data = (float *) self_kq_mask_swa->data;
-
-        std::fill(data, data + ggml_nelements(self_kq_mask_swa), -INFINITY);
-
-        fill_mask(data, hparams.n_swa, hparams.swa_type);
-
-        if (debug) {
-            print_mask(data, n_tokens, n_kv, hparams.n_swa, hparams.swa_type);
+        if (self_kq_mask_swa->type == GGML_TYPE_F16) {
+            fill_mask((ggml_fp16_t *) self_kq_mask_swa->data, ggml_nelements(self_kq_mask_swa), hparams.n_swa, hparams.swa_type);
+        } else {
+            fill_mask((float       *) self_kq_mask_swa->data, ggml_nelements(self_kq_mask_swa), hparams.n_swa, hparams.swa_type);
         }
     }
 }
@@ -499,6 +532,34 @@ bool llm_graph_input_attn_k::can_reuse(const llm_graph_params & params) {
     return res;
 }
 
+void llm_graph_input_attn_k_dsa::set_input(const llama_ubatch * ubatch) {
+    const auto * mla = mctx->get_mla();
+    const auto * lid = mctx->get_lid();
+    // main (MLA) cache: key indices and KQ mask
+    mla->set_input_k_idxs(self_k_idxs_mla, ubatch);
+    mla->set_input_kq_mask(self_kq_mask_mla, ubatch, cparams.causal_attn);
+    // indexer (lid) cache: key indices, KQ mask and the k_rot input
+    lid->set_input_k_idxs(self_k_idxs_lid, ubatch);
+    lid->set_input_kq_mask(self_kq_mask_lid, ubatch, cparams.causal_attn);
+    lid->set_input_k_rot(self_k_rot_lid);
+}
+
+bool llm_graph_input_attn_k_dsa::can_reuse(const llm_graph_params & params) {
+    // rebind to the memory context of the incoming graph params before any check runs
+    mctx = static_cast<const llama_kv_cache_dsa_context *>(params.mctx);
+
+    const auto & ub = params.ubatch;
+
+    // both index tensors must match the token count of the new ubatch
+    const bool idxs_ok = self_k_idxs_mla->ne[0] == ub.n_tokens && self_k_idxs_lid->ne[0] == ub.n_tokens;
+
+    // both mask checks are always evaluated, exactly as with an accumulating `res &= ...`
+    const bool mask_mla_ok = can_reuse_kq_mask(self_kq_mask_mla, mctx->get_mla(), ub, params.cparams);
+    const bool mask_lid_ok = can_reuse_kq_mask(self_kq_mask_lid, mctx->get_lid(), ub, params.cparams);
+
+    return idxs_ok && mask_mla_ok && mask_lid_ok;
+}
+
 void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) {
     // base tensors may not be allocated if there are no non-SWA attention layers
     if (self_k_idxs && self_k_idxs->buffer) {
@@ -568,23 +629,30 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
     GGML_ASSERT(ggml_backend_buffer_is_host(cross_kq_mask->buffer));
     GGML_ASSERT(!ubatch->equal_seqs()); // TODO: use ubatch->n_seqs instead of failing
 
-    float * data = (float *) cross_kq_mask->data;
-
-    for (int i = 0; i < n_tokens; ++i) {
-        GGML_ASSERT(!cross->seq_ids_enc.empty() && "llama_encode must be called first");
-        for (int j = 0; j < n_enc; ++j) {
-            float f = -INFINITY;
+    const auto fill_mask = [&](auto * data) {
+        using T = std::remove_reference_t<decltype(*data)>;
+        for (int i = 0; i < n_tokens; ++i) {
+            GGML_ASSERT(!cross->seq_ids_enc.empty() && "llama_encode must be called first");
+            for (int j = 0; j < n_enc; ++j) {
+                float f = -INFINITY;
 
-            for (int s = 0; s < ubatch->n_seq_id[i]; ++s) {
-                const llama_seq_id seq_id = ubatch->seq_id[i][s];
+                for (int s = 0; s < ubatch->n_seq_id[i]; ++s) {
+                    const llama_seq_id seq_id = ubatch->seq_id[i][s];
 
-                if (cross->seq_ids_enc[j].find(seq_id) != cross->seq_ids_enc[j].end()) {
-                    f = 0.0f;
+                    if (cross->seq_ids_enc[j].find(seq_id) != cross->seq_ids_enc[j].end()) {
+                        f = 0.0f;
+                    }
                 }
-            }
 
-            data[i*n_enc + j] = f;
+                data[i*n_enc + j] = llama_cast<T>(f);
+            }
         }
+    };
+
+    if (cross_kq_mask->type == GGML_TYPE_F16) {
+        fill_mask((ggml_fp16_t *) cross_kq_mask->data);
+    } else {
+        fill_mask((float *) cross_kq_mask->data);
     }
 }
 
@@ -2088,17 +2156,20 @@ ggml_tensor * llm_graph_context::build_attn_mha(
 llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() const {
     auto inp = std::make_unique<llm_graph_input_attn_no_cache>(hparams, cparams);
 
+    // flash attention requires an f16 mask
+    const auto type_mask = cparams.flash_attn ? GGML_TYPE_F16 : GGML_TYPE_F32;
+
     // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch
-    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens, 1, 1);
+    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, type_mask, n_tokens, n_tokens, 1, 1);
     ggml_set_input(inp->self_kq_mask);
 
-    inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
+    inp->self_kq_mask_cnv = inp->self_kq_mask;
 
     if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
-        inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens, 1, 1);
+        inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, type_mask, n_tokens, n_tokens, 1, 1);
         ggml_set_input(inp->self_kq_mask_swa);
 
-        inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
+        inp->self_kq_mask_swa_cnv = inp->self_kq_mask_swa;
     } else {
         inp->self_kq_mask_swa     = nullptr;
         inp->self_kq_mask_swa_cnv = nullptr;
@@ -2175,7 +2246,7 @@ static std::unique_ptr<llm_graph_input_attn_kv> build_attn_inp_kv_impl(
         inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch);
 
         inp->self_kq_mask = build_attn_inp_kq_mask(ctx0, mctx_cur, ubatch, cparams);
-        inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
+        inp->self_kq_mask_cnv = inp->self_kq_mask;
     }
 
     inp->self_k_rot = mctx_cur->build_input_k_rot(ctx0);
@@ -2282,7 +2353,7 @@ static std::unique_ptr<llm_graph_input_attn_k> build_attn_inp_k_impl(
         inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch);
 
         inp->self_kq_mask = build_attn_inp_kq_mask(ctx0, mctx_cur, ubatch, cparams);
-        inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
+        inp->self_kq_mask_cnv = inp->self_kq_mask;
     }
 
     return inp;
@@ -2354,6 +2425,82 @@ ggml_tensor * llm_graph_context::build_attn(
     return cur;
 }
 
+ggml_tensor * llm_graph_context::build_attn(
+        llm_graph_input_attn_k_dsa * inp,
+        ggml_tensor * wo,
+        ggml_tensor * wo_b,
+        ggml_tensor * wo_s,
+        ggml_tensor * q_cur,
+        ggml_tensor * k_cur,
+        ggml_tensor * v_cur,
+        ggml_tensor * kq_b,
+        ggml_tensor * sinks,
+        ggml_tensor * v_mla,
+        ggml_tensor * top_k,
+            float     kq_scale,
+            int       il) const {
+    // these nodes are added to the graph together so that they are not reordered
+    // by doing so, the number of splits in the graph is reduced
+    // expand k later to enable rope fusion which directly writes into k-v cache
+    ggml_build_forward_expand(gf, q_cur);
+    ggml_build_forward_expand(gf, v_cur);
+    ggml_build_forward_expand(gf, k_cur);
+    // only the MLA cache (get_mla) is written and read here; v_cur is expanded above but never stored
+    const auto * mctx_cur = inp->mctx->get_mla();
+
+    // store to KV cache
+    {
+        const auto & k_idxs = inp->get_k_idxs_mla();
+
+        ggml_build_forward_expand(gf, mctx_cur->cpy_k(ctx0, k_cur, k_idxs, il));
+    }
+    // KQ mask of the MLA cache for this ubatch (the _cnv tensor of the input object)
+    const auto & kq_mask = inp->get_kq_mask_mla();
+
+    // prepare new kq mask - starts filled with -INFINITY
+    ggml_tensor * kq_mask_all = ggml_fill(ctx0, kq_mask, -INFINITY);
+
+    // reshape KQ mask into tensor with rows of size 1:
+    // [n_kv, n_batch, 1, n_stream] -> [1, n_kv, n_batch, n_stream]
+    kq_mask_all = ggml_view_4d(ctx0, kq_mask_all, 1, kq_mask_all->ne[0], kq_mask_all->ne[1], kq_mask_all->ne[3], kq_mask_all->nb[0], kq_mask_all->nb[1], kq_mask_all->nb[2], 0);
+
+    // reshape top_k indices: [n_top_k, n_batch, 1, n_stream] -> [n_top_k, n_batch, n_stream, 1]
+    ggml_tensor * top_k_3d = ggml_view_4d(ctx0, top_k, top_k->ne[0], top_k->ne[1], top_k->ne[3], 1, top_k->nb[1], top_k->nb[2], top_k->ne[3]*top_k->nb[3], 0);
+
+    // prepare zero-filled tensor with rows of size 1: [1, n_top_k, n_batch, n_stream]
+    // this will be our source of zero values for unmasking top k mask elements
+    ggml_tensor * zeros = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, 1, top_k_3d->ne[0], top_k_3d->ne[1], top_k_3d->ne[2]);
+    zeros = ggml_fill(ctx0, zeros, 0.0f);
+    // NOTE(review): zeros is F32 while kq_mask is F16 when flash attention is enabled - confirm ggml_set_rows converts
+    // modify KQ mask by unmasking elements that are in top_k indices
+    // ggml_set_rows([1, n_kv, n_batch, n_stream], [1, n_top_k, n_batch, n_stream], [n_top_k, n_batch, n_stream, 1])
+    ggml_tensor * kq_mask_top_k = ggml_set_rows(ctx0, kq_mask_all, zeros, top_k_3d);
+
+    // reshape to restore the original shape of KQ mask:
+    // [1, n_kv, n_batch, n_stream] -> [n_kv, n_batch, 1, n_stream]
+    kq_mask_top_k = ggml_view_4d(ctx0, kq_mask_top_k, kq_mask_top_k->ne[1], kq_mask_top_k->ne[2], 1, kq_mask_top_k->ne[3], kq_mask_top_k->nb[2], kq_mask_top_k->nb[3], kq_mask_top_k->nb[3], 0);
+
+    // combine with the original kq mask
+    kq_mask_top_k = ggml_add(ctx0, kq_mask_top_k, kq_mask);
+    // k is read back from the cache; v is a view over the same data keeping the first v_cur->ne[0] elements of each row
+    ggml_tensor * q = q_cur;
+    ggml_tensor * k = mctx_cur->get_k(ctx0, il);
+    ggml_tensor * v = ggml_view_4d(ctx0, k, v_cur->ne[0], k->ne[1], k->ne[2], k->ne[3], k->nb[1], k->nb[2], k->nb[3], 0);
+
+    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask_top_k, sinks, v_mla, kq_scale, il);
+    cb(cur, "kqv_out", il);
+    // optional output projection (wo, with scale wo_s) followed by an optional bias (wo_b)
+    if (wo) {
+        cur = build_lora_mm(wo, cur, wo_s);
+    }
+
+    if (wo_b) {
+        cur = ggml_add(ctx0, cur, wo_b);
+    }
+
+    return cur;
+}
+
 ggml_tensor * llm_graph_context::build_attn(
         llm_graph_input_attn_kv_iswa * inp,
         ggml_tensor * wo,
@@ -2446,10 +2593,13 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
 
     const int32_t n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train;
 
-    inp->cross_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_enc, n_tokens, 1, 1);
+    // flash attention requires an f16 mask
+    const auto type_mask = cparams.flash_attn ? GGML_TYPE_F16 : GGML_TYPE_F32;
+
+    inp->cross_kq_mask = ggml_new_tensor_4d(ctx0, type_mask, n_enc, n_tokens, 1, 1);
     ggml_set_input(inp->cross_kq_mask);
 
-    inp->cross_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->cross_kq_mask, GGML_TYPE_F16) : inp->cross_kq_mask;
+    inp->cross_kq_mask_cnv = inp->cross_kq_mask;
 
     return (llm_graph_input_attn_cross *) res->add_input(std::move(inp));
 }
@@ -2497,6 +2647,34 @@ ggml_tensor * llm_graph_context::build_attn(
     return cur;
 }
 
+llm_graph_input_attn_k_dsa * llm_graph_context::build_attn_inp_k_dsa() const {
+    // the DSA memory context exposes two caches: the main one (mla) and the indexer one (lid)
+    const auto * mctx_dsa = static_cast<const llama_kv_cache_dsa_context *>(mctx);
+    const auto * mla      = mctx_dsa->get_mla();
+    const auto * lid      = mctx_dsa->get_lid();
+
+    auto inp = std::make_unique<llm_graph_input_attn_k_dsa>(hparams, cparams, mctx_dsa);
+
+    // main cache: key indices and KQ mask (the mask type follows cparams.flash_attn)
+    inp->self_k_idxs_mla      = mla->build_input_k_idxs(ctx0, ubatch);
+    inp->self_kq_mask_mla     = build_attn_inp_kq_mask(ctx0, mla, ubatch, cparams);
+    inp->self_kq_mask_mla_cnv = inp->self_kq_mask_mla;
+
+    // indexer cache: key indices
+    inp->self_k_idxs_lid = lid->build_input_k_idxs(ctx0, ubatch);
+
+    // ensure F32 mask for the indexer:
+    // build it from a copy of the params with flash attention switched off
+    auto cparams_f32 = cparams;
+    cparams_f32.flash_attn = false;
+
+    inp->self_kq_mask_lid     = build_attn_inp_kq_mask(ctx0, lid, ubatch, cparams_f32);
+    inp->self_kq_mask_lid_cnv = inp->self_kq_mask_lid;
+    inp->self_k_rot_lid       = lid->build_input_k_rot(ctx0);
+
+    return (llm_graph_input_attn_k_dsa *) res->add_input(std::move(inp));
+}
+
 // TODO: maybe separate the inner implementation into a separate function
 //       like with the non-sliding window equivalent
 //       once sliding-window hybrid caches are a thing.
@@ -2510,7 +2688,7 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
         inp->self_v_idxs = mctx_cur->get_base()->build_input_v_idxs(ctx0, ubatch);
 
         inp->self_kq_mask = build_attn_inp_kq_mask(ctx0, mctx_cur->get_base(), ubatch, cparams);
-        inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
+        inp->self_kq_mask_cnv = inp->self_kq_mask;
     }
 
     {
@@ -2520,7 +2698,7 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
         inp->self_v_idxs_swa = mctx_cur->get_swa()->build_input_v_idxs(ctx0, ubatch);
 
         inp->self_kq_mask_swa = build_attn_inp_kq_mask(ctx0, mctx_cur->get_swa(), ubatch, cparams);
-        inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
+        inp->self_kq_mask_swa_cnv = inp->self_kq_mask_swa;
     }
 
     inp->self_k_rot = mctx_cur->get_base()->build_input_k_rot(ctx0);
@@ -2689,7 +2867,7 @@ llm_graph_input_mem_hybrid_iswa * llm_graph_context::build_inp_mem_hybrid_iswa()
         inp_attn->self_v_idxs = attn_ctx->get_base()->build_input_v_idxs(ctx0, ubatch);
 
         inp_attn->self_kq_mask = build_attn_inp_kq_mask(ctx0, attn_ctx->get_base(), ubatch, cparams);
-        inp_attn->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_attn->self_kq_mask, GGML_TYPE_F16) : inp_attn->self_kq_mask;
+        inp_attn->self_kq_mask_cnv = inp_attn->self_kq_mask;
     }
 
     {
@@ -2697,7 +2875,7 @@ llm_graph_input_mem_hybrid_iswa * llm_graph_context::build_inp_mem_hybrid_iswa()
         inp_attn->self_v_idxs_swa = attn_ctx->get_swa()->build_input_v_idxs(ctx0, ubatch);
 
         inp_attn->self_kq_mask_swa = build_attn_inp_kq_mask(ctx0, attn_ctx->get_swa(), ubatch, cparams);
-        inp_attn->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_attn->self_kq_mask_swa, GGML_TYPE_F16) : inp_attn->self_kq_mask_swa;
+        inp_attn->self_kq_mask_swa_cnv = inp_attn->self_kq_mask_swa;
     }
 
     auto inp = std::make_unique<llm_graph_input_mem_hybrid_iswa>(cparams, std::move(inp_attn), std::move(inp_rs), mctx_cur);
diff --git a/src/llama-graph.h b/src/llama-graph.h
index bf6778237e6..eab82bd0d70 100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -22,6 +22,7 @@ struct llama_layer;
 struct llama_memory_context_i;
 
 class llama_kv_cache_context;
+class llama_kv_cache_dsa_context;
 class llama_kv_cache_iswa_context;
 class llama_memory_recurrent_context;
 class llama_memory_hybrid_context;
@@ -121,6 +122,23 @@ class llm_graph_input_embd : public llm_graph_input_i {
     const int64_t n_embd = 0;
 };
 
+// similar to llm_graph_input_embd but with an additional hidden state input
+class llm_graph_input_embd_h : public llm_graph_input_i {
+public:
+    llm_graph_input_embd_h(int64_t n_embd) : n_embd(n_embd) {}
+    virtual ~llm_graph_input_embd_h() = default;
+    // uploads the ubatch tokens, or else its embeddings; when embeddings are present they are also copied into h
+    void set_input(const llama_ubatch * ubatch) override;
+    // true when every tensor needed by the new ubatch exists and matches its token count
+    bool can_reuse(const llm_graph_params & params) override;
+
+    ggml_tensor * tokens = nullptr; // I32 [n_batch]
+    ggml_tensor * embd   = nullptr; // F32 [n_embd, n_batch]
+    ggml_tensor * h      = nullptr; // F32 [n_embd, n_batch]
+    // expected size of dim 0 of embd and h (asserted in set_input)
+    const int64_t n_embd = 0;
+};
+
 class llm_graph_input_pos : public llm_graph_input_i {
 public:
     llm_graph_input_pos(uint32_t n_pos_per_embd) : n_pos_per_embd(n_pos_per_embd) {}
@@ -274,10 +292,10 @@ class llm_graph_input_attn_no_cache : public llm_graph_input_i {
     ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; }
 
     // n_tokens == n_batch
-    ggml_tensor * self_kq_mask         = nullptr; // F32 [n_tokens, n_batch/n_stream, 1, n_stream]
-    ggml_tensor * self_kq_mask_cnv     = nullptr; //     [n_tokens, n_batch/n_stream, 1, n_stream]
-    ggml_tensor * self_kq_mask_swa     = nullptr; // F32 [n_tokens, n_batch/n_stream, 1, n_stream]
-    ggml_tensor * self_kq_mask_swa_cnv = nullptr; //     [n_tokens, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask         = nullptr; // F32/F16 [n_tokens, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_cnv     = nullptr; //         [n_tokens, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_swa     = nullptr; // F32/F16 [n_tokens, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_swa_cnv = nullptr; //         [n_tokens, n_batch/n_stream, 1, n_stream]
 
     const llama_hparams hparams;
     const llama_cparams cparams;
@@ -307,8 +325,8 @@ class llm_graph_input_attn_kv : public llm_graph_input_i {
     ggml_tensor * self_k_idxs = nullptr; // I64 [n_batch]
     ggml_tensor * self_v_idxs = nullptr; // I64 [n_batch] or [n_batch*n_embd_v_gqa]
 
-    ggml_tensor * self_kq_mask     = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
-    ggml_tensor * self_kq_mask_cnv = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask     = nullptr; // F32/F16 [n_kv, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_cnv = nullptr; //         [n_kv, n_batch/n_stream, 1, n_stream]
 
     // note: assumes v_rot^2 == I
     ggml_tensor * self_k_rot = nullptr;
@@ -347,8 +365,8 @@ class llm_graph_input_attn_k : public llm_graph_input_i {
 
     ggml_tensor * self_k_idxs = nullptr; // I64 [n_batch]
 
-    ggml_tensor * self_kq_mask     = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
-    ggml_tensor * self_kq_mask_cnv = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask     = nullptr; // F32/F16 [n_kv, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_cnv = nullptr; //         [n_kv, n_batch/n_stream, 1, n_stream]
 
     const llama_hparams hparams;
     const llama_cparams cparams;
@@ -356,6 +374,44 @@ class llm_graph_input_attn_k : public llm_graph_input_i {
     const llama_kv_cache_context * mctx;
 };
 
+class llm_graph_input_attn_k_dsa : public llm_graph_input_i { // attention inputs for the DSA cache: one set per sub-cache (mla, lid)
+public:
+    llm_graph_input_attn_k_dsa(
+            const llama_hparams & hparams,
+            const llama_cparams & cparams,
+            const llama_kv_cache_dsa_context * mctx) :
+        hparams(hparams),
+        cparams(cparams),
+        mctx(mctx) {
+    }
+    ~llm_graph_input_attn_k_dsa() = default;
+    // fills the key indices and KQ masks of both sub-caches, plus the k_rot input of the lid cache
+    void set_input(const llama_ubatch * ubatch) override;
+    // rebinds mctx to params.mctx, then checks index tensor sizes and both KQ masks against the new ubatch
+    bool can_reuse(const llm_graph_params & params) override;
+
+    ggml_tensor * get_k_idxs_mla() const { return self_k_idxs_mla; }
+    ggml_tensor * get_k_idxs_lid() const { return self_k_idxs_lid; }
+
+    ggml_tensor * get_kq_mask_mla() const { return self_kq_mask_mla_cnv; }
+    ggml_tensor * get_kq_mask_lid() const { return self_kq_mask_lid; } // NOTE(review): raw mask, not the _cnv member (same tensor as built today)
+
+    ggml_tensor * self_k_idxs_mla = nullptr; // I64 [n_batch]
+    ggml_tensor * self_k_idxs_lid = nullptr; // I64 [n_batch]
+
+    ggml_tensor * self_kq_mask_mla     = nullptr; // F32/F16 [n_kv, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_mla_cnv = nullptr; //         [n_kv, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_lid     = nullptr; // F32     [n_kv, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_lid_cnv = nullptr; //         [n_kv, n_batch/n_stream, 1, n_stream]
+
+    ggml_tensor * self_k_rot_lid = nullptr; // filled through the lid cache's set_input_k_rot()
+
+    const llama_hparams hparams;
+    const llama_cparams cparams;
+    // raw pointer to the memory context; can_reuse() replaces it with the one from the incoming params
+    const llama_kv_cache_dsa_context * mctx;
+};
+
 class llm_graph_input_attn_kv_iswa : public llm_graph_input_i {
 public:
     llm_graph_input_attn_kv_iswa(
@@ -385,10 +441,10 @@ class llm_graph_input_attn_kv_iswa : public llm_graph_input_i {
     ggml_tensor * self_k_idxs_swa = nullptr; // I64 [n_batch]
     ggml_tensor * self_v_idxs_swa = nullptr; // I64 [n_batch] or [n_batch*n_embd_v_gqa]
 
-    ggml_tensor * self_kq_mask         = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
-    ggml_tensor * self_kq_mask_cnv     = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]
-    ggml_tensor * self_kq_mask_swa     = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
-    ggml_tensor * self_kq_mask_swa_cnv = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask         = nullptr; // F32/F16 [n_kv, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_cnv     = nullptr; //         [n_kv, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_swa     = nullptr; // F32/F16 [n_kv, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_swa_cnv = nullptr; //         [n_kv, n_batch/n_stream, 1, n_stream]
 
     ggml_tensor * self_k_rot = nullptr;
     ggml_tensor * self_v_rot = nullptr;
@@ -411,8 +467,8 @@ class llm_graph_input_attn_cross : public llm_graph_input_i {
 
     ggml_tensor * get_kq_mask_cross() const { return cross_kq_mask_cnv; }
 
-    ggml_tensor * cross_kq_mask     = nullptr; // F32 [n_outputs_enc, n_batch, 1, 1]
-    ggml_tensor * cross_kq_mask_cnv = nullptr; // F32 [n_outputs_enc, n_batch, 1, 1]
+    ggml_tensor * cross_kq_mask     = nullptr; // F32/F16 [n_outputs_enc, n_batch, 1, 1]
+    ggml_tensor * cross_kq_mask_cnv = nullptr; // F32/F16 [n_outputs_enc, n_batch, 1, 1]
 
     const llama_cross * cross = nullptr;
 };
@@ -956,6 +1012,23 @@ struct llm_graph_context {
                   float   kq_scale,
                     int   il) const;
 
+    llm_graph_input_attn_k_dsa * build_attn_inp_k_dsa() const; // builds the DSA attention inputs and adds them to the graph result
+    // DSA attention over the MLA cache: the KQ mask is unmasked only at the key positions listed in top_k
+    ggml_tensor * build_attn(
+            llm_graph_input_attn_k_dsa * inp,
+            ggml_tensor * wo,
+            ggml_tensor * wo_b,
+            ggml_tensor * wo_s,
+            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
+            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
+            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
+            ggml_tensor * kq_b,
+            ggml_tensor * sinks, // [n_head_q]
+            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
+            ggml_tensor * top_k, // [n_indexer_top_k, n_tokens] indices of the keys to leave unmasked
+                  float   kq_scale,
+                    int   il) const;
+
     llm_graph_input_attn_kv_iswa * build_attn_inp_kv_iswa() const;
 
     // note: if k_cur or v_cur are not provided, they will not be stored in the memory
diff --git a/src/llama-impl.h b/src/llama-impl.h
index e4f35c8e53d..7923c3f7ed5 100644
--- a/src/llama-impl.h
+++ b/src/llama-impl.h
@@ -3,6 +3,7 @@
 #include "ggml.h" // for ggml_log_level
 
 #include <string>
+#include <type_traits>
 #include <vector>
 
 #ifdef __GNUC__
@@ -40,6 +41,19 @@ struct no_init {
     no_init() = default;
 };
 
+template <typename dst_t, typename src_t>
+static inline dst_t llama_cast(src_t v) { // scalar conversion between float and ggml_fp16_t, selected at compile time
+    if constexpr (std::is_same_v<src_t, dst_t>) {
+        return v; // identical types: pass the value through
+    } else if constexpr (std::is_same_v<src_t, ggml_fp16_t> && std::is_same_v<dst_t, float>) {
+        return ggml_fp16_to_fp32(v);
+    } else if constexpr (std::is_same_v<src_t, float> && std::is_same_v<dst_t, ggml_fp16_t>) {
+        return ggml_fp32_to_fp16(v);
+    } else {
+        static_assert(std::is_same_v<dst_t, void>, "unsupported type combination"); // type-dependent condition: evaluated only if this branch is instantiated
+    }
+}
+
 struct time_meas {
     time_meas(int64_t & t_acc, bool disable = false);
     ~time_meas();
diff --git a/src/llama-kv-cache-dsa.cpp b/src/llama-kv-cache-dsa.cpp
new file mode 100644
index 00000000000..e44004b5586
--- /dev/null
+++ b/src/llama-kv-cache-dsa.cpp
@@ -0,0 +1,261 @@
+#include "llama-kv-cache-dsa.h"
+
+#include "llama-impl.h"
+#include "llama-batch.h"
+#include "llama-model.h"
+
+#include <algorithm>
+#include <cassert>
+
+//
+// llama_kv_cache_dsa
+//
+
+llama_kv_cache_dsa::llama_kv_cache_dsa( // builds two llama_kv_cache instances (main + indexer) from the same arguments
+        const llama_model & model,
+                ggml_type   type_k,
+                ggml_type   type_v,
+                     bool   v_trans,
+                     bool   offload,
+                     bool   unified,
+                 uint32_t   kv_size,
+                 uint32_t   n_seq_max,
+                 uint32_t   n_pad,
+                 uint32_t   n_swa,
+           llama_swa_type   swa_type,
+    const layer_filter_cb & filter,
+    const  layer_reuse_cb & reuse) :
+    hparams_lid(model.hparams), n_stream(unified ? 1 : n_seq_max) { // hparams_lid starts as a copy of the model hparams; a single stream when unified
+
+    LLAMA_LOG_INFO("%s: creating main KV cache, size = %u cells\n", __func__, kv_size);
+
+    kv_mla = std::make_unique<llama_kv_cache>(
+            model, model.hparams, type_k, type_v, // the main cache receives the unmodified model hparams
+            v_trans, offload, unified, kv_size, n_seq_max, n_pad,
+            n_swa, swa_type, filter, reuse);
+
+    // we use llama_kv_cache for caching indexer keys
+    // by hand-tweaking some hparams we fool it to create
+    // indexer key cache tensors with correct dimensions
+    // https://github.com/ggml-org/llama.cpp/pull/21149#discussion_r3015940823
+
+    // DSA lightning indexer uses MQA with single key head
+    std::fill(hparams_lid.n_head_kv_arr.begin(), hparams_lid.n_head_kv_arr.end(), 1); // every entry of the per-layer KV head array becomes 1
+    hparams_lid.n_embd_head_k_full = model.hparams.indexer_head_size; // key head size replaced by the indexer head size
+    hparams_lid.rope_type          = LLAMA_ROPE_TYPE_NEOX;
+
+    LLAMA_LOG_INFO("%s: creating indexer KV cache, size = %u cells\n", __func__, kv_size);
+
+    kv_lid = std::make_unique<llama_kv_cache>(
+            model, hparams_lid, type_k, type_v, // same arguments as the main cache except for the tweaked hparams_lid member
+            v_trans, offload, unified, kv_size, n_seq_max, n_pad,
+            n_swa, swa_type, filter, reuse);
+}
+
+void llama_kv_cache_dsa::clear(bool data) { // clears both underlying caches, forwarding `data` unchanged
+    kv_mla->clear(data);
+    kv_lid->clear(data);
+}
+
+bool llama_kv_cache_dsa::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { // applies seq_rm to both caches; true only if both calls return true
+    bool res = true;
+
+    res = res & kv_mla->seq_rm(seq_id, p0, p1); // bitwise & does not short-circuit ...
+    res = res & kv_lid->seq_rm(seq_id, p0, p1); // ... so the indexer cache is still called when the main cache returned false
+
+    return res;
+}
+
+void llama_kv_cache_dsa::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { // forwards seq_cp to both caches with identical arguments
+    kv_mla->seq_cp(seq_id_src, seq_id_dst, p0, p1);
+    kv_lid->seq_cp(seq_id_src, seq_id_dst, p0, p1);
+}
+
+void llama_kv_cache_dsa::seq_keep(llama_seq_id seq_id) { // forwards seq_keep to both caches
+    kv_mla->seq_keep(seq_id);
+    kv_lid->seq_keep(seq_id);
+}
+
+void llama_kv_cache_dsa::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) { // forwards seq_add to both caches with identical arguments
+    kv_mla->seq_add(seq_id, p0, p1, shift);
+    kv_lid->seq_add(seq_id, p0, p1, shift);
+}
+
+void llama_kv_cache_dsa::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { // forwards seq_div to both caches with identical arguments
+    kv_mla->seq_div(seq_id, p0, p1, d);
+    kv_lid->seq_div(seq_id, p0, p1, d);
+}
+
+llama_pos llama_kv_cache_dsa::seq_pos_min(llama_seq_id seq_id) const { // answered by the main cache only; the indexer cache is not consulted
+    return kv_mla->seq_pos_min(seq_id);
+}
+
+llama_pos llama_kv_cache_dsa::seq_pos_max(llama_seq_id seq_id) const { // answered by the main cache only; the indexer cache is not consulted
+    return kv_mla->seq_pos_max(seq_id);
+}
+
+std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache_dsa::memory_breakdown() const { // combined per-buffer-type sizes of both caches
+    std::map<ggml_backend_buffer_type_t, size_t> mb = kv_mla->memory_breakdown(); // start from the main cache's breakdown
+    for (const auto & buft_size : kv_lid->memory_breakdown()) {
+        mb[buft_size.first] += buft_size.second; // operator[] inserts a zero entry for buffer types only the indexer cache reports
+    }
+    return mb;
+}
+
+llama_memory_context_ptr llama_kv_cache_dsa::init_batch( // splits the batch into ubatches and prepares slots for them in both caches
+            llama_batch_allocr & balloc,
+            uint32_t n_ubatch,
+            bool embd_all) {
+    GGML_UNUSED(embd_all);
+
+    do { // single-pass block: every `break` below falls through to the FAILED_PREPARE return
+        balloc.split_reset();
+
+        std::vector<llama_ubatch> ubatches;
+        while (true) {
+            auto ubatch = n_stream == 1 ? balloc.split_simple(n_ubatch) : balloc.split_equal(n_ubatch, true); // split strategy is chosen by the stream count
+
+            if (ubatch.n_tokens == 0) { // an empty ubatch ends the splitting loop
+                break;
+            }
+
+            ubatches.push_back(std::move(ubatch)); // NOLINT
+        }
+
+        if (balloc.get_n_used() < balloc.get_n_tokens()) {
+            // failed to find a suitable split
+            break;
+        }
+
+        auto sinfos_mla = kv_mla->prepare(ubatches);
+        if (sinfos_mla.empty()) { // an empty result is treated as failure to prepare the main cache
+            break;
+        }
+
+        auto sinfos_lid = kv_lid->prepare(ubatches); // the indexer cache is prepared for the same ubatches
+        if (sinfos_lid.empty()) {
+            break;
+        }
+
+        assert(sinfos_mla.size() == sinfos_lid.size());
+
+        return std::make_unique<llama_kv_cache_dsa_context>(
+                this, std::move(sinfos_mla), std::move(sinfos_lid), std::move(ubatches));
+    } while (false);
+
+    return std::make_unique<llama_kv_cache_dsa_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
+}
+
+llama_memory_context_ptr llama_kv_cache_dsa::init_full() { // the context constructor calls init_full() on both underlying caches
+    return std::make_unique<llama_kv_cache_dsa_context>(this);
+}
+
+llama_memory_context_ptr llama_kv_cache_dsa::init_update(llama_context * lctx, bool optimize) { // the context constructor calls init_update() on both underlying caches
+    return std::make_unique<llama_kv_cache_dsa_context>(this, lctx, optimize);
+}
+
+bool llama_kv_cache_dsa::get_can_shift() const { // true only if both caches can shift and report the same size
+    return kv_mla->get_can_shift() &&
+           kv_lid->get_can_shift() &&
+           kv_mla->get_size() == kv_lid->get_size();
+}
+
+void llama_kv_cache_dsa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const { // writes both caches to the same stream
+    kv_mla->state_write(io, seq_id, flags); // main cache first ...
+    kv_lid->state_write(io, seq_id, flags); // ... then the indexer cache
+}
+
+void llama_kv_cache_dsa::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) { // reads both caches from the same stream
+    kv_mla->state_read(io, seq_id, flags); // same order as state_write: main cache first ...
+    kv_lid->state_read(io, seq_id, flags); // ... then the indexer cache
+}
+
+llama_kv_cache * llama_kv_cache_dsa::get_mla() const { // non-owning pointer to the main cache; ownership stays with the unique_ptr member
+    return kv_mla.get();
+}
+
+llama_kv_cache * llama_kv_cache_dsa::get_lid() const { // non-owning pointer to the indexer cache; ownership stays with the unique_ptr member
+    return kv_lid.get();
+}
+
+//
+// llama_kv_cache_dsa_context
+//
+
+llama_kv_cache_dsa_context::llama_kv_cache_dsa_context(llama_memory_status status) : status(status) {} // error context: only the status is set; ctx_mla/ctx_lid are left default-initialized
+
+llama_kv_cache_dsa_context::llama_kv_cache_dsa_context( // full-cache context: wraps one init_full() context per underlying cache
+        llama_kv_cache_dsa * kv) :
+    ctx_mla(kv->get_mla()->init_full()),
+    ctx_lid(kv->get_lid()->init_full()),
+    status(llama_memory_status_combine(ctx_mla->get_status(), ctx_lid->get_status())) { // overall status is the combination of both child statuses
+}
+
+llama_kv_cache_dsa_context::llama_kv_cache_dsa_context( // update context: wraps one init_update() context per underlying cache
+        llama_kv_cache_dsa * kv,
+        llama_context * lctx,
+        bool optimize) :
+    ctx_mla(kv->get_mla()->init_update(lctx, optimize)),
+    ctx_lid(kv->get_lid()->init_update(lctx, optimize)),
+    status(llama_memory_status_combine(ctx_mla->get_status(), ctx_lid->get_status())) { // overall status is the combination of both child statuses
+}
+
+llama_kv_cache_dsa_context::llama_kv_cache_dsa_context( // batch context: one llama_kv_cache_context per cache, both built for the same ubatches
+        llama_kv_cache_dsa * kv,
+        slot_info_vec_t sinfos_mla,
+        slot_info_vec_t sinfos_lid,
+        std::vector<llama_ubatch> ubatches) :
+    ubatches(std::move(ubatches)), // the parameter shadows the member, hence `this->ubatches` below
+    // note: here we copy the ubatches. not sure if this is ideal
+    ctx_mla(new llama_kv_cache_context(kv->get_mla(), std::move(sinfos_mla), this->ubatches)), // the ubatches member is declared before ctx_mla/ctx_lid, so it is already initialized here
+    ctx_lid(new llama_kv_cache_context(kv->get_lid(), std::move(sinfos_lid), this->ubatches)),
+    status(llama_memory_status_combine(ctx_mla->get_status(), ctx_lid->get_status())) {
+}
+
+llama_kv_cache_dsa_context:: ~llama_kv_cache_dsa_context() = default; // defaulted here, out of line, rather than in the class definition
+
+bool llama_kv_cache_dsa_context::next() { // advances to the next ubatch; returns false once all ubatches have been consumed
+    assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+
+    ctx_mla->next(); // both child contexts are advanced together; their return values are ignored
+    ctx_lid->next();
+
+    if (++i_next >= ubatches.size()) { // i_next alone decides whether any ubatch remains
+        return false;
+    }
+
+    return true;
+}
+
+bool llama_kv_cache_dsa_context::apply() { // applies both child contexts; true only if both calls return true
+    assert(!llama_memory_status_is_fail(status));
+
+    bool res = true;
+
+    res = res & ctx_mla->apply(); // bitwise & does not short-circuit ...
+    res = res & ctx_lid->apply(); // ... so the indexer context is applied even if the main one returned false
+
+    return res;
+}
+
+llama_memory_status llama_kv_cache_dsa_context::get_status() const { // returns the status fixed at construction (the member is const)
+    return status;
+}
+
+const llama_ubatch & llama_kv_cache_dsa_context::get_ubatch() const { // returns the ubatch at the current i_next; asserts a SUCCESS status
+    assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+
+    return ubatches[i_next]; // no bounds check here; the index is advanced by next()
+}
+
+const llama_kv_cache_context * llama_kv_cache_dsa_context::get_mla() const { // main-cache child context, as its concrete type
+    assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+
+    return static_cast<const llama_kv_cache_context *>(ctx_mla.get()); // downcast from the generic context pointer; the batch constructor creates it as llama_kv_cache_context
+}
+
+const llama_kv_cache_context * llama_kv_cache_dsa_context::get_lid()  const { // indexer-cache child context, as its concrete type
+    assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+
+    return static_cast<const llama_kv_cache_context *>(ctx_lid.get()); // downcast from the generic context pointer; the batch constructor creates it as llama_kv_cache_context
+}
diff --git a/src/llama-kv-cache-dsa.h b/src/llama-kv-cache-dsa.h
new file mode 100644
index 00000000000..e2b330993b8
--- /dev/null
+++ b/src/llama-kv-cache-dsa.h
@@ -0,0 +1,138 @@
+#pragma once
+
+#include "llama-kv-cache.h"
+
+#include <vector>
+
+//
+// llama_kv_cache_dsa
+//
+
+// utilizes two instances of llama_kv_cache:
+// - the first instance (kv_mla) is for caching key tensors of the model,
+// - the second instance (kv_lid) is for caching lightning indexer key tensors
+
+class llama_kv_cache_dsa : public llama_memory_i { // llama_memory_i implementation backed by two llama_kv_cache instances
+public:
+    llama_kv_cache_dsa( // all arguments except the hparams are passed unchanged to both underlying caches
+            const llama_model & model,
+                    ggml_type   type_k,
+                    ggml_type   type_v,
+                         bool   v_trans,
+                         bool   offload,
+                         bool   unified,
+                     uint32_t   kv_size,
+                     uint32_t   n_seq_max,
+                     uint32_t   n_pad,
+                     uint32_t   n_swa,
+               llama_swa_type   swa_type,
+        const layer_filter_cb & filter,
+        const  layer_reuse_cb & reuse);
+
+    ~llama_kv_cache_dsa() = default;
+
+    //
+    // llama_memory_i
+    //
+
+    llama_memory_context_ptr init_batch(
+            llama_batch_allocr & balloc,
+            uint32_t n_ubatch,
+            bool embd_all) override;
+
+    llama_memory_context_ptr init_full() override;
+
+    llama_memory_context_ptr init_update(llama_context * lctx, bool optimize) override;
+
+    bool get_can_shift() const override;
+
+    void clear(bool data) override;
+
+    bool seq_rm  (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1) override;
+    void seq_cp  (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
+    void seq_keep(llama_seq_id seq_id)                                                          override;
+    void seq_add (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, llama_pos shift) override;
+    void seq_div (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, int d) override;
+
+    llama_pos seq_pos_min(llama_seq_id seq_id) const override;
+    llama_pos seq_pos_max(llama_seq_id seq_id) const override;
+
+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
+
+    // state write/load
+
+    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
+    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
+
+    //
+    // llama_kv_cache_dsa specific API
+    //
+
+    llama_kv_cache * get_mla() const; // main cache (non-owning pointer)
+    llama_kv_cache * get_lid() const; // lightning indexer cache (non-owning pointer)
+
+private:
+    // we keep indexer KV cache hparams instance here as llama_kv_cache stores only reference to it
+    llama_hparams hparams_lid; // declared before kv_lid, so it is constructed first and destroyed after it
+    const uint32_t n_stream  = 1; // the constructor sets this to 1 when unified, otherwise to n_seq_max
+
+    std::unique_ptr<llama_kv_cache> kv_mla; // main cache, built with the model's own hparams
+    std::unique_ptr<llama_kv_cache> kv_lid; // indexer cache, built with hparams_lid
+};
+
+class llama_kv_cache_dsa_context : public llama_memory_context_i { // memory context that drives one child context per underlying cache
+public:
+    using slot_info_vec_t = llama_kv_cache::slot_info_vec_t;
+
+    // used for errors
+    llama_kv_cache_dsa_context(llama_memory_status status);
+
+    // used to create a full-cache context
+    llama_kv_cache_dsa_context(
+            llama_kv_cache_dsa * kv);
+
+    // used to create an update context
+    llama_kv_cache_dsa_context(
+            llama_kv_cache_dsa * kv,
+            llama_context * lctx,
+            bool optimize);
+
+    // used to create a batch processing context from a batch
+    llama_kv_cache_dsa_context(
+            llama_kv_cache_dsa * kv,
+            slot_info_vec_t sinfos_mla, // slot infos for the main cache
+            slot_info_vec_t sinfos_lid, // slot infos for the indexer cache
+            std::vector<llama_ubatch> ubatches);
+
+    virtual ~llama_kv_cache_dsa_context();
+
+    //
+    // llama_memory_context_i
+    //
+
+    bool next()  override;
+    bool apply() override;
+
+    llama_memory_status  get_status() const override;
+    const llama_ubatch & get_ubatch() const override;
+
+    //
+    // llama_kv_cache_dsa_context specific API
+    //
+
+    const llama_kv_cache_context * get_mla() const; // child context of the main cache
+    const llama_kv_cache_context * get_lid() const; // child context of the indexer cache
+
+private:
+    //llama_kv_cache_dsa * kv;
+
+    // the index of the next ubatch to process
+    size_t i_next = 0;
+
+    std::vector<llama_ubatch> ubatches; // must stay declared before ctx_mla/ctx_lid: the batch constructor passes this->ubatches while initializing them
+
+    const llama_memory_context_ptr ctx_mla;
+    const llama_memory_context_ptr ctx_lid;
+
+    const llama_memory_status status; // declared last: the constructors compute it from ctx_mla and ctx_lid
+};
diff --git a/src/llama-kv-cache-iswa.cpp b/src/llama-kv-cache-iswa.cpp
index 26e2cb4270b..9b9f1790363 100644
--- a/src/llama-kv-cache-iswa.cpp
+++ b/src/llama-kv-cache-iswa.cpp
@@ -60,14 +60,14 @@ llama_kv_cache_iswa::llama_kv_cache_iswa(
     LLAMA_LOG_INFO("%s: creating non-SWA KV cache, size = %u cells\n", __func__, size_base);
 
     kv_base = std::make_unique<llama_kv_cache>(
-            model, type_k, type_v,
+            model, hparams, type_k, type_v,
             v_trans, offload, unified, size_base, n_seq_max, n_pad,
             0, LLAMA_SWA_TYPE_NONE, filter_base, reuse);
 
     LLAMA_LOG_INFO("%s: creating     SWA KV cache, size = %u cells\n", __func__, size_swa);
 
     kv_swa = std::make_unique<llama_kv_cache>(
-            model, type_k, type_v,
+            model, hparams, type_k, type_v,
             v_trans, offload, unified, size_swa, n_seq_max, n_pad,
             hparams.n_swa, hparams.swa_type, filter_swa, reuse);
 }
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index a49a055a630..ac11f96c22d 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -79,6 +79,7 @@ static ggml_tensor * ggml_mul_mat_aux(
 
 llama_kv_cache::llama_kv_cache(
         const llama_model & model,
+        const llama_hparams & hparams,
                 ggml_type   type_k,
                 ggml_type   type_v,
                      bool   v_trans,
@@ -91,7 +92,7 @@ llama_kv_cache::llama_kv_cache(
            llama_swa_type   swa_type,
     const layer_filter_cb & filter,
     const  layer_reuse_cb & reuse) :
-    model(model), hparams(model.hparams), v_trans(v_trans),
+    model(model), hparams(hparams), v_trans(v_trans),
     n_seq_max(n_seq_max), n_stream(unified ? 1 : n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) {
 
     GGML_ASSERT(kv_size % n_pad == 0);
@@ -253,7 +254,7 @@ llama_kv_cache::llama_kv_cache(
     // allocate tensors and initialize the buffers to avoid NaNs in the padding
     for (auto & [buft, ctx] : ctx_map) {
         ggml_backend_buffer_t buf;
-        if (model.hparams.no_alloc) {
+        if (hparams.no_alloc) {
             buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // dummy buffer
             for (ggml_tensor * t = ggml_get_first_tensor(ctx.get()); t != nullptr; t = ggml_get_next_tensor(ctx.get(), t)) {
                 t->buffer = buf; // set dummy buffer for KV cache so that the backend scheduler won't try to allocate it
@@ -293,6 +294,11 @@ llama_kv_cache::llama_kv_cache(
         ggml_is_quantized(type_k) &&
         hparams.n_embd_head_k() % 64 == 0;
 
+    // always create Hadamard rotation tensors for DeepSeek V3.2 DSA lightning indexer
+    if (model.arch == LLM_ARCH_DEEPSEEK32 && hparams.n_embd_head_k_full == hparams.indexer_head_size) {
+        attn_rot_k = true;
+    }
+
     attn_rot_v =
         !attn_rot_disable &&
         n_embd_head_v_all > 0 &&
@@ -1430,8 +1436,8 @@ struct args_set_input_kq_mask {
     int64_t n_tps;
 };
 
-template<bool causal, bool swa, bool is_2d, bool alibi>
-static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
+template<typename T, bool causal, bool swa, bool is_2d, bool alibi>
+static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, T * data) {
   //const auto & hparams = args.hparams;
     const auto & ubatch  = args.ubatch;
 
@@ -1445,6 +1451,9 @@ static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float *
     const int64_t n_stream = args.n_stream;
     const int64_t n_tps    = args.n_tps;
 
+    const T mask_keep = llama_cast<T>(0.0f);
+    const T mask_drop = llama_cast<T>(-INFINITY);
+
     // the min position in the batch for each sequence
     llama_pos seq_pos_min[LLAMA_MAX_SEQ];
     std::fill(seq_pos_min, seq_pos_min + LLAMA_MAX_SEQ, INT32_MAX);
@@ -1563,46 +1572,55 @@ static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float *
                 }
 
                 if (alibi) {
-                    data[idst + j] = -std::abs(p0 - p1);
+                    data[idst + j] = llama_cast<T>(static_cast<float>(-std::abs(p0 - p1)));
                 } else {
-                    data[idst + j] = 0.0f;
+                    data[idst + j] = mask_keep;
                 }
 
                 continue;
 skip:
-                data[idst + j] = -INFINITY;
+                data[idst + j] = mask_drop;
             }
         }
     }
 }
 
-template<bool causal, bool swa, bool is_2d>
-static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
+template<typename T, bool causal, bool swa, bool is_2d>
+static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, T * data) {
     const bool alibi = args.hparams.use_alibi;
     if (alibi) {
-        set_input_kq_mask_impl<causal, swa, is_2d, true> (args, data);
+        set_input_kq_mask_impl<T, causal, swa, is_2d, true> (args, data);
     } else {
-        set_input_kq_mask_impl<causal, swa, is_2d, false>(args, data);
+        set_input_kq_mask_impl<T, causal, swa, is_2d, false>(args, data);
     }
 }
 
-template<bool causal, bool swa>
-static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
+template<typename T, bool causal, bool swa>
+static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, T * data) {
     const bool is_2d = args.ubatch->is_pos_2d();
     if (is_2d) {
-        set_input_kq_mask_impl<causal, swa, true> (args, data);
+        set_input_kq_mask_impl<T, causal, swa, true> (args, data);
     } else {
-        set_input_kq_mask_impl<causal, swa, false>(args, data);
+        set_input_kq_mask_impl<T, causal, swa, false>(args, data);
     }
 }
 
-template<bool causal>
-static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
+template<typename T, bool causal>
+static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, T * data) {
     const bool swa = args.swa_type != LLAMA_SWA_TYPE_NONE;
     if (swa) {
-        set_input_kq_mask_impl<causal, true> (args, data);
+        set_input_kq_mask_impl<T, causal, true> (args, data);
+    } else {
+        set_input_kq_mask_impl<T, causal, false>(args, data);
+    }
+}
+
+template<typename T>
+static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, T * data, bool causal_attn) {
+    if (causal_attn) {
+        set_input_kq_mask_impl<T, true> (args, data);
     } else {
-        set_input_kq_mask_impl<causal, false>(args, data);
+        set_input_kq_mask_impl<T, false>(args, data);
     }
 }
 
@@ -1610,7 +1628,6 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u
     const uint32_t n_tokens = ubatch->n_tokens;
 
     GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
-    float * data = (float *) dst->data;
 
     const int64_t n_kv     = dst->ne[0];
     const int64_t n_stream = dst->ne[3]; // num streams in the current ubatch
@@ -1634,10 +1651,10 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u
         /*.n_tps            =*/ n_tps,
     };
 
-    if (causal_attn) {
-        set_input_kq_mask_impl<true> (args, data);
+    if (dst->type == GGML_TYPE_F16) {
+        set_input_kq_mask_impl<ggml_fp16_t>(args, (ggml_fp16_t *) dst->data, causal_attn);
     } else {
-        set_input_kq_mask_impl<false>(args, data);
+        set_input_kq_mask_impl<float>(args, (float *) dst->data, causal_attn);
     }
 
     //const int64_t t_end = ggml_time_us();
diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h
index 0b62dc7b232..649269af6dd 100644
--- a/src/llama-kv-cache.h
+++ b/src/llama-kv-cache.h
@@ -93,8 +93,12 @@ class llama_kv_cache : public llama_memory_i {
 
     using slot_info_vec_t = std::vector<slot_info>;
 
+    // TODO: refactor the memory instances to not depend on `llama_model`
+    //       instead pass all necessary info (e.g. hparams, dev layers, arch, etc.) directly
+    //       likely through `struct llama_memory_params`
     llama_kv_cache(
             const llama_model & model,
+            const llama_hparams & hparams,
                     ggml_type   type_k,
                     ggml_type   type_v,
                          bool   v_trans,
diff --git a/src/llama-memory-hybrid.cpp b/src/llama-memory-hybrid.cpp
index 33b3b395e0c..6bd2ec18ce3 100644
--- a/src/llama-memory-hybrid.cpp
+++ b/src/llama-memory-hybrid.cpp
@@ -33,6 +33,7 @@ llama_memory_hybrid::llama_memory_hybrid(
     hparams(model.hparams),
     mem_attn(new llama_kv_cache(
         model,
+        model.hparams,
         type_k,
         type_v,
         v_trans,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 0c3e03a61dc..914fc423b1f 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -10,6 +10,7 @@
 
 #include "llama-kv-cache.h"
 #include "llama-kv-cache-iswa.h"
+#include "llama-kv-cache-dsa.h"
 #include "llama-memory-hybrid.h"
 #include "llama-memory-hybrid-iswa.h"
 #include "llama-memory-recurrent.h"
@@ -172,6 +173,8 @@ static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params
             return new llama_model_deepseek2(params);
         case LLM_ARCH_DEEPSEEK2OCR:
             return new llama_model_deepseek2ocr(params);
+        case LLM_ARCH_DEEPSEEK32:
+            return new llama_model_deepseek32(params);
         case LLM_ARCH_GLM_DSA:
             return new llama_model_glm_dsa(params);
         case LLM_ARCH_MISTRAL4:
@@ -407,16 +410,16 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str
     auto get_tensor_config = [&]() -> tensor_config {
         // standard attention
         if (std::regex_match(tensor_name, pattern_q_weight) || std::regex_match(tensor_name, pattern_kv_weight)) {
-            return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_1, "attn_output.weight");
+            return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_1, "attn_output.weight", "ssm_out.weight");
         }
         if (std::regex_match(tensor_name, pattern_q_bias) || std::regex_match(tensor_name, pattern_kv_bias)) {
-            return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0, "attn_output.weight");
+            return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0, "attn_output.weight", "ssm_out.weight");
         }
         if (std::regex_match(tensor_name, pattern_qkv_weight)) {
-            return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_1);
+            return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_1, "attn_output.weight", "ssm_out.weight");
         }
         if ( std::regex_match(tensor_name, pattern_qkv_bias)) {
-            return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0);
+            return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0, "attn_output.weight", "ssm_out.weight");
         }
         if (std::regex_match(tensor_name, pattern_qk_norm)) {
             return get_tensor_config_impl(tensor->ne[1] == 1 ? GGML_BACKEND_SPLIT_AXIS_MIRRORED : GGML_BACKEND_SPLIT_AXIS_1, "attn_output.weight");
@@ -432,7 +435,7 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str
         }
 
         if (std::regex_match(tensor_name, pattern_attn_gate_weight)) {
-            return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_1);
+            return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_1, "attn_output.weight", "ssm_out.weight");
         }
         if (std::regex_match(tensor_name, pattern_ssm_dt) || std::regex_match(tensor_name, pattern_ssm_a)) {
             return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0, "ssm_out.weight");
@@ -779,6 +782,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_310B_A15B:     return "310B.A15B";
         case LLM_TYPE_355B_A32B:     return "355B.A32B";
         case LLM_TYPE_397B_A17B:     return "397B.A17B";
+        case LLM_TYPE_685B_A37B:     return "685B.A37B";
         case LLM_TYPE_744B_A40B:     return "744B.A40B";
         case LLM_TYPE_E2B:           return "E2B";
         case LLM_TYPE_E4B:           return "E4B";
@@ -1769,7 +1773,7 @@ void llama_model::print_info() const {
             LLAMA_LOG_INFO("%s: expert_weights_scale  = %.1f\n",   __func__, hparams.expert_weights_scale);
         }
 
-        if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_DEEPSEEK2OCR || arch == LLM_ARCH_GLM_DSA || arch == LLM_ARCH_MISTRAL4) {
+        if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_DEEPSEEK2OCR || arch == LLM_ARCH_DEEPSEEK32 || arch == LLM_ARCH_GLM_DSA || arch == LLM_ARCH_MISTRAL4) {
             LLAMA_LOG_INFO("%s: n_layer_dense_lead    = %d\n",     __func__, hparams.n_layer_dense_lead);
             LLAMA_LOG_INFO("%s: n_lora_q              = %d\n",     __func__, hparams.n_lora_q);
             LLAMA_LOG_INFO("%s: n_lora_kv             = %d\n",     __func__, hparams.n_lora_kv);
@@ -1957,6 +1961,23 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
             {
                 res = nullptr;
             } break;
+        case LLM_ARCH_DEEPSEEK32:
+            {
+                res = new llama_kv_cache_dsa(
+                        *this,
+                        params.type_k,
+                        params.type_v,
+                        !cparams.flash_attn,
+                        cparams.offload_kqv,
+                        cparams.kv_unified,
+                        cparams.n_ctx_seq,
+                        cparams.n_seq_max,
+                        1,
+                        hparams.n_swa,
+                        hparams.swa_type,
+                        nullptr,
+                        nullptr);
+            } break;
         // Models that need standard caching should rely on recurrent/hybrid
         // checks
         default:
@@ -2083,6 +2104,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
 
                         res = new llama_kv_cache(
                                 *this,
+                                hparams,
                                 params.type_k,
                                 params.type_v,
                                 !cparams.flash_attn,
@@ -2272,6 +2294,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_DEEPSEEK:
         case LLM_ARCH_DEEPSEEK2:
         case LLM_ARCH_DEEPSEEK2OCR:
+        case LLM_ARCH_DEEPSEEK32:
         case LLM_ARCH_PLM:
         case LLM_ARCH_CHATGLM:
         case LLM_ARCH_GRANITE:
diff --git a/src/llama-model.h b/src/llama-model.h
index b797b8966ac..743feb970d9 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -137,6 +137,7 @@ enum llm_type {
     LLM_TYPE_310B_A15B, // /MiMo-V2-Flash
     LLM_TYPE_355B_A32B, // GLM-4.5
     LLM_TYPE_397B_A17B, // Qwen3.5
+    LLM_TYPE_685B_A37B, // DeepSeek V3.2
     LLM_TYPE_744B_A40B, // GLM-5
     LLM_TYPE_E2B,
     LLM_TYPE_E4B,
diff --git a/src/llama.cpp b/src/llama.cpp
index dfe30ce8f61..edacd1d5f42 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -239,8 +239,9 @@ static bool llama_prepare_model_devices(const llama_model_params & params, llama
         // add GPUs
         model->devices.insert(model->devices.end(), gpus.begin(), gpus.end());
 
-        // add integrated GPUs only if no other devices were found
-        if (model->devices.empty()) {
+        // add integrated GPUs only if no discrete GPUs were found
+        // (RPC servers do not count, otherwise the local iGPU would be dropped on iGPU+RPC setups)
+        if (gpus.empty()) {
             model->devices.insert(model->devices.end(), igpus.begin(), igpus.end());
         }
     }
diff --git a/src/models/deepseek32.cpp b/src/models/deepseek32.cpp
new file mode 100644
index 00000000000..c92ab60d166
--- /dev/null
+++ b/src/models/deepseek32.cpp
@@ -0,0 +1,503 @@
+#include "models.h"
+
+#include "llama-kv-cache.h"
+#include "llama-kv-cache-dsa.h"
+
+void llama_model_deepseek32::load_arch_hparams(llama_model_loader & ml) { // reads the MoE, MLA, DSA indexer, YaRN and NextN hparams through the model loader
+    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,     hparams.n_ff_exp);
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,    hparams.f_norm_rms_eps);
+    hparams.f_norm_eps = 1e-6;  // eps for (non-RMS) layer norm: fixed here, not read from the model file
+    ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
+
+    // MoE parameters
+    ml.get_key(LLM_KV_EXPERT_COUNT,                hparams.n_expert);
+    ml.get_key(LLM_KV_EXPERT_USED_COUNT,           hparams.n_expert_used);
+    ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared);
+    ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead, false);
+    ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale, false);
+    ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,         hparams.expert_weights_norm, false);
+
+    // deepseek MLA parameters
+    ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK,      hparams.n_lora_q);
+    ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK,     hparams.n_lora_kv);
+    ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA,   hparams.n_embd_head_k_mla_impl, false);
+    ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla_impl, false);
+    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); // NOTE(review): duplicate of the read at the top of this function
+    ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,        hparams.n_expert_shared); // NOTE(review): duplicate of the read in the MoE section above
+
+    // DSA parameters
+    ml.get_key(LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, hparams.indexer_n_head);
+    ml.get_key(LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, hparams.indexer_head_size);
+    ml.get_key(LLM_KV_ATTENTION_INDEXER_TOP_K,      hparams.indexer_top_k);
+
+    // Expert gating function
+    ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func);
+
+    if (ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f)) { // NOTE(review): the 0.0f sits where other calls pass the bool `false` - presumably "not required"; confirm
+        // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+        // cancel the factor from the convert script
+        hparams.rope_yarn_log_mul /= 0.1f;
+    }
+
+    // NextN/MTP parameters
+    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS,        hparams.nextn_predict_layers, false);
+    GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
+
+    // TODO: when MTP is implemented, this should probably be updated if needed
+    hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers; // the trailing NextN layers are excluded from this count
+
+    switch (hparams.n_layer) { // model size label is derived from the layer count alone
+        case 62: type = LLM_TYPE_685B_A37B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_deepseek32::load_arch_tensors(llama_model_loader &) { // declares all weight tensors; trailing NextN/MTP layers are declared with TENSOR_SKIP
+    LLAMA_LOAD_LOCALS;
+    const bool is_mla = hparams.is_mla();
+    if (!is_mla) {
+        throw std::runtime_error("DEEPSEEK32 architecture requires MLA");
+    }
+
+    // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
+    const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla();
+    const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla();
+
+    const int64_t n_embd_head_qk_rope = hparams.n_rot();
+    const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
+
+    const int64_t q_lora_rank  = hparams.n_lora_q;
+    const int64_t kv_lora_rank = hparams.n_lora_kv;
+
+    const int64_t n_ff_exp        = hparams.n_ff_exp;
+    const int64_t n_expert_shared = hparams.n_expert_shared;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    // try to load output.weight, if not found, use token_embd (tied embeddings)
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+    if (!output) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        int flags = 0;
+        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+            // skip all tensors in the NextN layers
+            // TODO @ngxson : TENSOR_NOT_REQUIRED was a hack, need to remove it later
+            flags |= TENSOR_SKIP | TENSOR_NOT_REQUIRED;
+        }
+
+        auto & layer = layers[i];
+
+        layer.attn_norm      = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);
+        layer.attn_q_a_norm  = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, flags);
+        layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, flags);
+
+        layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, flags);
+        layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, flags);
+
+        layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope}, flags);
+
+        // note: MLA is mandatory here, so only the split wk_b/wv_b tensors are loaded (an unsplit legacy wkv_b is not handled)
+        layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, kv_lora_rank, n_head}, flags);
+        layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, flags);
+
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, flags);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
+
+        // DSA indexer
+        layer.indexer_k_norm   = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM,   "weight", i), {hparams.indexer_head_size}, flags);
+        layer.indexer_k_norm_b = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM,   "bias",   i), {hparams.indexer_head_size}, flags);
+        layer.indexer_proj     = create_tensor(tn(LLM_TENSOR_INDEXER_PROJ,     "weight", i), {n_embd, hparams.indexer_n_head}, flags);
+        layer.indexer_attn_k   = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_K,   "weight", i), {n_embd, hparams.indexer_head_size}, flags);
+        layer.indexer_attn_q_b = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_Q_B, "weight", i), {q_lora_rank, hparams.indexer_n_head * hparams.indexer_head_size}, flags);
+        if (i < (int) hparams.n_layer_dense_lead) { // leading layers use a dense FFN, the rest use MoE
+            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, flags);
+            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, flags);
+            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, flags);
+        } else {
+            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, flags);
+            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, flags | TENSOR_NOT_REQUIRED); // optional bias; carries `flags` so NextN layers skip it like every other tensor
+
+            if (n_expert == 0) {
+                throw std::runtime_error("n_expert must be > 0");
+            }
+            if (n_expert_used == 0) {
+                throw std::runtime_error("n_expert_used must be > 0");
+            }
+
+            // MoE branch
+            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, flags);
+            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, flags);
+            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, flags);
+
+            // Shared expert branch
+            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, flags);
+            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {        n_ff_exp * n_expert_shared, n_embd}, flags);
+            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * n_expert_shared}, flags);
+        }
+
+        // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
+        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+            layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
+            layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
+            layer.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
+
+            // Optional tensors
+            layer.nextn.embed_tokens     = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
+            layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
+            layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags | TENSOR_NOT_REQUIRED);
+        }
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_deepseek32::build_arch_graph(const llm_graph_params & params) const { // factory for this architecture's graph builder
+    return std::make_unique<graph>(*this, params); // the nested `graph` constructor does all the graph construction
+}
+
+llama_model_deepseek32::graph::graph(const llama_model & model, const llm_graph_params & params) :
+    llm_graph_context(params) { // builds the forward graph: per layer DSA indexer + MLA attention + dense/MoE FFN, then the LM head
+    const bool is_mla = hparams.is_mla();
+    GGML_ASSERT(is_mla);
+
+    // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
+    const int64_t n_embd_head_k = hparams.n_embd_head_k_mla();
+    const int64_t n_embd_head_v = hparams.n_embd_head_v_mla();
+    GGML_UNUSED(n_embd_head_v);
+
+    const int64_t n_embd_head_qk_rope = hparams.n_rot();
+    const int64_t n_embd_head_qk_nope = n_embd_head_k - n_embd_head_qk_rope;
+
+    const int64_t n_indexer_head = hparams.indexer_n_head;
+    const int64_t n_embd_indexer_head = hparams.indexer_head_size;
+    const int64_t n_embd_indexer_head_rope = hparams.n_rot();
+    const int64_t n_embd_indexer_head_nope = n_embd_indexer_head - n_embd_indexer_head_rope;
+    const uint32_t n_indexer_top_k = hparams.indexer_top_k;
+
+    const uint32_t kv_lora_rank = hparams.n_lora_kv;
+
+    // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
+    // See https://github.com/ggml-org/llama.cpp/discussions/7416 for detailed explanation.
+    // And also: https://github.com/ggml-org/llama.cpp/pull/17945 [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+
+    // first cancel the adjustment from llama_hparams::yarn_attn_factor_adjust to get the original attn_factor
+    GGML_ASSERT(ext_factor >= 0.0f);
+    const float attn_factor_org = attn_factor * (1.0f + 0.1f * logf(1.0f / freq_scale));
+
+    // use the original attn_factor to pre-scale the kq_scale
+    const float mscale   = attn_factor_org * (1.0f + 0.1f * hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
+    const float kq_scale = 1.0f * mscale * mscale / sqrtf(float(n_embd_head_k));
+
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+
+    // {n_embd, n_tokens}
+    inpL = build_inp_embd(model.tok_embd);
+
+    // inp_pos - contains the positions
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    llm_graph_input_attn_k_dsa * inp_attn_dsa = build_attn_inp_k_dsa();
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    int effective_n_layers = hparams.n_layer - hparams.nextn_predict_layers; // the trailing NextN/MTP layers are not part of this graph
+    for (int il = 0; il < effective_n_layers; ++il) {
+        ggml_tensor * inpSA = inpL;
+
+        // norm
+        cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // self_attention
+        {
+            ggml_tensor * qr = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
+            cb(qr, "qr", il);
+
+            qr = build_norm(qr, model.layers[il].attn_q_a_norm, nullptr, LLM_NORM_RMS, il);
+            cb(qr, "qr", il);
+
+            ggml_tensor * top_k = nullptr;
+
+            // lightning indexer
+            {
+                ggml_tensor * indexer_q = ggml_mul_mat(ctx0, model.layers[il].indexer_attn_q_b, qr);
+                cb(indexer_q, "indexer_q", il);
+
+                // split into {n_embd_indexer_head_rope, n_indexer_head, n_tokens}
+                ggml_tensor * indexer_q_pe =
+                    ggml_view_3d(ctx0, indexer_q, n_embd_indexer_head_rope, n_indexer_head, n_tokens,
+                                 ggml_row_size(indexer_q->type, n_embd_indexer_head),
+                                 ggml_row_size(indexer_q->type, n_embd_indexer_head) * n_indexer_head, 0);
+                cb(indexer_q_pe, "indexer_q_pe", il);
+
+                // and {n_embd_indexer_head_nope, n_indexer_head, n_tokens}
+                ggml_tensor * indexer_q_nope =
+                    ggml_view_3d(ctx0, indexer_q, n_embd_indexer_head_nope, n_indexer_head, n_tokens,
+                                 ggml_row_size(indexer_q->type, n_embd_indexer_head),
+                                 ggml_row_size(indexer_q->type, n_embd_indexer_head) * n_indexer_head,
+                                 ggml_row_size(indexer_q->type, n_embd_indexer_head_rope)); // nope part starts right after the rope part
+                cb(indexer_q_nope, "indexer_q_nope", il);
+
+                indexer_q_pe = ggml_rope_ext(ctx0, indexer_q_pe, inp_pos, nullptr, n_rot,
+                                     LLAMA_ROPE_TYPE_NEOX, n_ctx_orig, freq_base, freq_scale,
+                                     ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(indexer_q_pe, "indexer_q_pe", il);
+
+                // {n_embd_indexer_head_rope + n_embd_indexer_head_nope, n_indexer_head, n_tokens}
+                indexer_q = ggml_concat(ctx0, indexer_q_pe, indexer_q_nope, 0);
+                cb(indexer_q, "indexer_q", il);
+
+                ggml_tensor * indexer_k = ggml_mul_mat(ctx0, model.layers[il].indexer_attn_k, cur);
+                cb(indexer_k, "indexer_k", il);
+
+                indexer_k = build_norm(indexer_k, model.layers[il].indexer_k_norm, model.layers[il].indexer_k_norm_b, LLM_NORM, il);
+                cb(indexer_k, "indexer_k", il);
+
+                // split into {n_embd_indexer_head_rope, 1, n_tokens}
+                ggml_tensor * indexer_k_pe =
+                    ggml_view_3d(ctx0, indexer_k, n_embd_indexer_head_rope, 1, n_tokens,
+                                 ggml_row_size(indexer_k->type, n_embd_indexer_head),
+                                 ggml_row_size(indexer_k->type, n_embd_indexer_head) * 1, 0);
+                cb(indexer_k_pe, "indexer_k_pe", il);
+
+                // and {n_embd_indexer_head_nope, 1, n_tokens}
+                ggml_tensor * indexer_k_nope =
+                    ggml_view_3d(ctx0, indexer_k, n_embd_indexer_head_nope, 1, n_tokens,
+                                 ggml_row_size(indexer_k->type, n_embd_indexer_head),
+                                 ggml_row_size(indexer_k->type, n_embd_indexer_head) * 1,
+                                 ggml_row_size(indexer_k->type, n_embd_indexer_head_rope)); // nope part starts right after the rope part
+                cb(indexer_k_nope, "indexer_k_nope", il);
+
+                indexer_k_pe = ggml_rope_ext(ctx0, indexer_k_pe, inp_pos, nullptr, n_rot,
+                                     LLAMA_ROPE_TYPE_NEOX, n_ctx_orig, freq_base, freq_scale,
+                                     ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(indexer_k_pe, "indexer_k_pe", il);
+
+                // {n_embd_indexer_head_rope + n_embd_indexer_head_nope, 1, n_tokens}
+                indexer_k = ggml_concat(ctx0, indexer_k_pe, indexer_k_nope, 0);
+                cb(indexer_k, "indexer_k", il);
+
+                // perform Hadamard transform on indexer q and k
+                indexer_q = ggml_mul_mat(ctx0, inp_attn_dsa->self_k_rot_lid, indexer_q);
+                cb(indexer_q, "indexer_q", il);
+                indexer_k = ggml_mul_mat(ctx0, inp_attn_dsa->self_k_rot_lid, indexer_k);
+                cb(indexer_k, "indexer_k", il);
+
+                // store indexer keys to KV cache
+                const auto * mctx_lid = inp_attn_dsa->mctx->get_lid();
+                const auto & k_idxs_lid = inp_attn_dsa->get_k_idxs_lid();
+                ggml_build_forward_expand(gf, mctx_lid->cpy_k(ctx0, indexer_k, k_idxs_lid, il));
+
+                // prepare indexer weights
+                ggml_tensor * indexer_weights = ggml_mul_mat(ctx0, model.layers[il].indexer_proj, cur);
+                cb(indexer_weights, "indexer_weights", il);
+
+                // get cached indexer keys
+                indexer_k = mctx_lid->get_k(ctx0, il);
+
+                // split the batch into streams if needed
+                const auto n_stream = indexer_k->ne[3];
+                indexer_q = ggml_view_4d(ctx0, indexer_q, indexer_q->ne[0], indexer_q->ne[1], indexer_q->ne[2]/n_stream, n_stream, indexer_q->nb[1], indexer_q->nb[2], indexer_q->nb[3]/n_stream, 0);
+                indexer_weights = ggml_view_4d(ctx0, indexer_weights, indexer_weights->ne[0], indexer_weights->ne[1]/n_stream, indexer_weights->ne[2], n_stream, indexer_weights->nb[1], indexer_weights->nb[2]/n_stream, indexer_weights->nb[3]/n_stream, 0);
+
+                // calculate indexer kq
+                indexer_q = ggml_permute(ctx0, indexer_q, 0, 2, 1, 3);
+                cb(indexer_q, "indexer_q", il);
+                indexer_k = ggml_permute(ctx0, indexer_k, 0, 2, 1, 3);
+                cb(indexer_k, "indexer_k", il);
+
+                ggml_tensor * indexer_kq = ggml_mul_mat(ctx0, indexer_k, indexer_q);
+                cb(indexer_kq, "indexer_kq", il);
+
+                // ReLU requires contiguous tensors
+                indexer_kq = ggml_cont(ctx0, ggml_permute(ctx0, indexer_kq, 2, 1, 0, 3));
+                cb(indexer_kq, "indexer_kq", il);
+
+                // apply ReLU
+                ggml_tensor * indexer_score = ggml_relu(ctx0, indexer_kq);
+                cb(indexer_score, "indexer_score", il);
+
+                // pre-scale weights to avoid scaling operations on huge indexer_score tensor
+                indexer_weights = ggml_scale(ctx0, indexer_weights, 1.0f / sqrtf(float(n_embd_indexer_head * n_indexer_head)));
+                cb(indexer_weights, "indexer_weights", il);
+
+                // multiply scores by indexer weights
+                indexer_score = ggml_mul(ctx0, indexer_score, indexer_weights);
+                cb(indexer_score, "indexer_score", il);
+
+                // sum by q n_indexer_head dimension
+                indexer_score = ggml_sum_rows(ctx0, indexer_score);
+                cb(indexer_score, "indexer_score", il);
+
+                // permute result to match KQ mask
+                indexer_score = ggml_cont(ctx0, ggml_permute(ctx0, indexer_score, 2, 1, 0, 3));
+                cb(indexer_score, "indexer_score", il);
+
+                // mask indexer scores
+                ggml_tensor * indexer_kq_mask = inp_attn_dsa->get_kq_mask_lid();
+                indexer_score = ggml_add(ctx0, indexer_score, indexer_kq_mask);
+                cb(indexer_score, "indexer_score", il);
+
+                // get indices of top k indexer scores
+                uint32_t n_top_k = indexer_score->ne[0] < n_indexer_top_k ? indexer_score->ne[0] : n_indexer_top_k; // clamp k to the number of available scores
+                top_k = ggml_cont(ctx0, ggml_top_k(ctx0, indexer_score, n_top_k));
+                cb(top_k, "top_k", il);
+            }
+
+            ggml_tensor * q = ggml_mul_mat(ctx0, model.layers[il].wq_b, qr);
+            cb(q, "q", il);
+
+            // split into {n_embd_head_qk_nope, n_head, n_tokens}
+            ggml_tensor * q_nope =
+                ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens, ggml_row_size(q->type, n_embd_head_k),
+                             ggml_row_size(q->type, n_embd_head_k) * n_head, 0);
+            cb(q_nope, "q_nope", il);
+
+            // and {n_embd_head_qk_rope, n_head, n_tokens}
+            ggml_tensor * q_pe = ggml_view_3d(
+                ctx0, q, n_embd_head_qk_rope, n_head, n_tokens, ggml_row_size(q->type, n_embd_head_k),
+                ggml_row_size(q->type, n_embd_head_k) * n_head, ggml_row_size(q->type, n_embd_head_qk_nope));
+            cb(q_pe, "q_pe", il);
+
+            ggml_tensor * kv_cmpr_pe = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
+            cb(kv_cmpr_pe, "kv_cmpr_pe", il);
+
+            // split into {kv_lora_rank, n_tokens}
+            ggml_tensor * kv_cmpr =
+                ggml_view_2d(ctx0, kv_cmpr_pe, kv_lora_rank, n_tokens,
+                             ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope), 0);
+            cb(kv_cmpr, "kv_cmpr", il);
+
+            // and {n_embd_head_qk_rope, 1, n_tokens}
+            ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_cmpr_pe, n_embd_head_qk_rope, 1, n_tokens,
+                                              ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
+                                              ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
+                                              ggml_row_size(kv_cmpr_pe->type, kv_lora_rank));
+            cb(k_pe, "k_pe", il);
+
+            q_pe = ggml_rope_ext(ctx0, q_pe, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                                 ext_factor, attn_factor, beta_fast, beta_slow);
+            cb(q_pe, "q_pe", il);
+
+            k_pe = ggml_rope_ext(ctx0, k_pe, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                                 ext_factor, attn_factor, beta_fast, beta_slow);
+            cb(k_pe, "k_pe", il);
+
+            kv_cmpr = build_norm(kv_cmpr, model.layers[il].attn_kv_a_norm, nullptr, LLM_NORM_RMS, il);
+            cb(kv_cmpr, "kv_cmpr", il);
+
+            // MLA attention
+            {
+                // {n_embd_head_qk_nope, n_tokens, n_head}
+                q_nope = ggml_permute(ctx0, q_nope, 0, 2, 1, 3);
+                cb(q_nope, "q_nope_perm", il);
+
+                // {n_embd_head_qk_nope, kv_lora_rank, n_head} x {n_embd_head_qk_nope, n_tokens, n_head}
+                ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, model.layers[il].wk_b, q_nope);
+                cb(q_nope_absorbed, "q_nope_absorbed", il);
+
+                // {kv_lora_rank, n_head, n_tokens}
+                q_nope_absorbed = ggml_permute(ctx0, q_nope_absorbed, 0, 2, 1, 3);
+                cb(q_nope_absorbed, "q_nope_absorbed_perm", il);
+
+                // {kv_lora_rank + n_embd_head_qk_rope, n_head, n_tokens}
+                // NOTE(review): the absorbed (nope) part is concatenated first here; an earlier note said rope must go first for build_rope_shift() - confirm
+                ggml_tensor * Qcur = ggml_concat(ctx0, q_nope_absorbed, q_pe, 0);
+                cb(Qcur, "Qcur", il);
+
+                kv_cmpr = ggml_reshape_3d(ctx0, kv_cmpr, kv_lora_rank, 1, n_tokens);
+                cb(kv_cmpr, "kv_cmpr_reshape", il);
+
+                // {kv_lora_rank + n_embd_head_qk_rope, 1, n_tokens}
+                ggml_tensor * Kcur = ggml_concat(ctx0, kv_cmpr, k_pe, 0);
+                cb(Kcur, "Kcur", il);
+
+                // {kv_lora_rank, 1, n_tokens}
+                ggml_tensor * Vcur = kv_cmpr;
+                cb(Vcur, "Vcur", il);
+
+                // note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
+                cur = build_attn(inp_attn_dsa,
+                        model.layers[il].wo, NULL, model.layers[il].wo_s,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, top_k, kq_scale, il);
+            }
+        }
+        if (il == effective_n_layers - 1 && inp_out_ids) { // last layer: keep only the rows selected by inp_out_ids
+            cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
+            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+        }
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+        cb(ffn_inp, "ffn_inp", il);
+
+        cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+        cb(cur, "ffn_norm", il);
+
+        if ((uint32_t) il < hparams.n_layer_dense_lead) {
+            cur = build_ffn(cur,
+                model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_s,
+                model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_s,
+                model.layers[il].ffn_down, NULL, model.layers[il].ffn_down_s,
+                NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
+        } else {
+            // MoE branch
+            ggml_tensor * moe_out = build_moe_ffn(cur,
+                model.layers[il].ffn_gate_inp,
+                model.layers[il].ffn_up_exps,
+                model.layers[il].ffn_gate_exps,
+                model.layers[il].ffn_down_exps,
+                model.layers[il].ffn_exp_probs_b,
+                n_expert, n_expert_used,
+                LLM_FFN_SILU, hparams.expert_weights_norm,
+                hparams.expert_weights_scale,
+                (llama_expert_gating_func_type) hparams.expert_gating_func,
+                il,
+                nullptr,
+                model.layers[il].ffn_gate_up_exps,
+                model.layers[il].ffn_up_exps_s,
+                model.layers[il].ffn_gate_exps_s,
+                model.layers[il].ffn_down_exps_s);
+            cb(moe_out, "ffn_moe_out", il);
+
+            // FFN shared expert
+            {
+                ggml_tensor * ffn_shexp =
+                    build_ffn(cur,
+                        model.layers[il].ffn_up_shexp, NULL, model.layers[il].ffn_up_shexp_s,
+                        model.layers[il].ffn_gate_shexp, NULL, model.layers[il].ffn_gate_shexp_s,
+                        model.layers[il].ffn_down_shexp, NULL, model.layers[il].ffn_down_shexp_s,
+                        NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(ffn_shexp, "ffn_shexp", il);
+
+                cur = ggml_add(ctx0, moe_out, ffn_shexp);
+                cb(cur, "ffn_out", il);
+            }
+        }
+        cur = ggml_add(ctx0, cur, ffn_inp);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        inpL = cur;
+    }
+    cur = inpL;
+
+    cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // lm_head
+    cur = ggml_mul_mat(ctx0, model.output, cur);
+
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
diff --git a/src/models/models.h b/src/models/models.h
index db228865d5d..5251e2d8280 100644
--- a/src/models/models.h
+++ b/src/models/models.h
@@ -1030,6 +1030,19 @@ struct llama_model_deepseek2 : public llama_model_base {
 };
 
 
+struct llama_model_deepseek32 : public llama_model_base { // DeepSeek V3.2 model; member definitions are in src/models/deepseek32.cpp
+    llama_model_deepseek32(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override; // reads the architecture hyper-parameters through the loader
+    void load_arch_tensors(llama_model_loader & ml) override; // declares the weight tensors of every layer
+
+    struct graph : public llm_graph_context { // graph builder; all construction happens in the constructor
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; // returns a new `graph` for the given params
+};
+
+
 struct llama_model_deepseek2ocr : public llama_model_base {
     llama_model_deepseek2ocr(const struct llama_model_params & params) : llama_model_base(params) {}
     void load_arch_hparams(llama_model_loader & ml) override;
diff --git a/src/models/qwen35.cpp b/src/models/qwen35.cpp
index 04ecc18fcdc..ba63ae441df 100644
--- a/src/models/qwen35.cpp
+++ b/src/models/qwen35.cpp
@@ -508,28 +508,41 @@ llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr
     int sections[4];
     std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
 
-    auto inp = std::make_unique<llm_graph_input_embd>(hparams.n_embd);
+    // TODO: extract in a common llm_graph_context::build_inp_embd_h()
+    auto inp = std::make_unique<llm_graph_input_embd_h>(hparams.n_embd);
 
     inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
     ggml_set_input(inp->tokens);
 
-    inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd, n_tokens);
+    inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd_inp(), n_tokens);
     ggml_set_input(inp->embd);
-    ggml_set_name(inp->embd, "mtp_h_input");
 
-    ggml_tensor * tok_embd_w = layer.nextn.embed_tokens ? layer.nextn.embed_tokens : model.tok_embd;
+    // TODO: make static using `ggml_build_forward_select()`
+    //       see llm_graph_context::build_inp_embd() for reference
+    ggml_tensor * tok_embd;
+    if (ubatch.token) {
+        ggml_tensor * tok_embd_w = layer.nextn.embed_tokens ? layer.nextn.embed_tokens : model.tok_embd;
 
-    ggml_tensor * h_input  = inp->embd;
-    ggml_tensor * tok_embd = ggml_get_rows(ctx0, tok_embd_w, inp->tokens);
+        tok_embd = ggml_get_rows(ctx0, tok_embd_w, inp->tokens);
+    } else {
+        tok_embd = inp->embd;
+    }
     cb(tok_embd, "mtp_tok_embd", il);
 
+    inp->h = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd, n_tokens);
+    ggml_set_input(inp->h);
+    ggml_set_name(inp->h, "mtp_h_input");
+
+    ggml_tensor * h_embd = inp->h;
+
     res->add_input(std::move(inp));
 
     ggml_tensor * inp_pos     = build_inp_pos();
     ggml_tensor * inp_out_ids = build_inp_out_ids();
-    auto * inp_attn           = build_attn_inp_kv();
 
-    ggml_tensor * h_norm = build_norm(h_input, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il);
+    auto * inp_attn = build_attn_inp_kv();
+
+    ggml_tensor * h_norm = build_norm(h_embd, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il);
     cb(h_norm, "mtp_hnorm", il);
 
     ggml_tensor * e_norm = build_norm(tok_embd, layer.nextn.enorm, nullptr, LLM_NORM_RMS, il);
diff --git a/src/models/qwen35moe.cpp b/src/models/qwen35moe.cpp
index dc24f6ed537..4f87d55d911 100644
--- a/src/models/qwen35moe.cpp
+++ b/src/models/qwen35moe.cpp
@@ -571,29 +571,41 @@ llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm
     int sections[4];
     std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
 
-    auto inp = std::make_unique<llm_graph_input_embd>(hparams.n_embd);
+    // TODO: extract in a common llm_graph_context::build_inp_embd_h()
+    auto inp = std::make_unique<llm_graph_input_embd_h>(hparams.n_embd);
 
     inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
     ggml_set_input(inp->tokens);
 
-    inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd, n_tokens);
+    inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd_inp(), n_tokens);
     ggml_set_input(inp->embd);
-    ggml_set_name(inp->embd, "mtp_h_input");
 
-    ggml_tensor * tok_embd_w = layer.nextn.embed_tokens ? layer.nextn.embed_tokens : model.tok_embd;
+    // TODO: make static using `ggml_build_forward_select()`
+    //       see llm_graph_context::build_inp_embd() for reference
+    ggml_tensor * tok_embd;
+    if (ubatch.token) {
+        ggml_tensor * tok_embd_w = layer.nextn.embed_tokens ? layer.nextn.embed_tokens : model.tok_embd;
 
-    ggml_tensor * h_input  = inp->embd;
-    ggml_tensor * tok_embd = ggml_get_rows(ctx0, tok_embd_w, inp->tokens);
+        tok_embd = ggml_get_rows(ctx0, tok_embd_w, inp->tokens);
+    } else {
+        tok_embd = inp->embd;
+    }
     cb(tok_embd, "mtp_tok_embd", il);
 
+    inp->h = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd, n_tokens);
+    ggml_set_input(inp->h);
+    ggml_set_name(inp->h, "mtp_h_input");
+
+    ggml_tensor * h_embd = inp->h;
+
     res->add_input(std::move(inp));
 
     ggml_tensor * inp_pos     = build_inp_pos();
     ggml_tensor * inp_out_ids = build_inp_out_ids();
-    auto * inp_attn           = build_attn_inp_kv();
 
+    auto * inp_attn = build_attn_inp_kv();
 
-    ggml_tensor * h_norm = build_norm(h_input, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il);
+    ggml_tensor * h_norm = build_norm(h_embd, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il);
     cb(h_norm, "mtp_hnorm", il);
 
     ggml_tensor * e_norm = build_norm(tok_embd, layer.nextn.enorm, nullptr, LLM_NORM_RMS, il);
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 19f8558d897..58c5fdd10db 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -2415,6 +2415,15 @@ struct test_set_rows : public test_case {
         }
         return 1e-7;
     }
+
+    // Q8_0 on the WebGPU backend uses an NMSE threshold of at least 2e-7; see discussion here: https://github.com/ggml-org/llama.cpp/pull/23760#issuecomment-4566312209
+    double max_nmse_err(ggml_backend_t backend) override {
+        ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend));
+        if (type == GGML_TYPE_Q8_0 && strcmp(ggml_backend_reg_name(reg), "WebGPU") == 0) {
+            return std::max(test_case::max_nmse_err(backend), 2e-7);
+        }
+        return test_case::max_nmse_err(backend);
+    }
 };
 
 // GGML_OP_ROPE + GGML_OP_VIEW + GGML_OP_SET_ROWS
@@ -7803,6 +7812,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 2, 2560}, {3, 3, 2, 2560}, 1, 1, 1, 1, 1, 1, true));
     test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {5, 5, 1, 32}, {3, 4, 1, 32}, 1, 1, 0, 0, 1, 1, true));
     test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32, {2, 2, 1536, 729}, {2, 2, 1536, 4096}, 1, 1, 0, 0, 1, 1, true));
+    test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {128, 128, 1, 2}, {32, 33, 1, 2}, 1, 1, 1, 1, 1, 1, true));
+    test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {128, 128, 2, 1}, {33, 34, 2, 1}, 1, 1, 1, 1, 1, 1, true));
 
     // im2col 3D
     test_cases.emplace_back(new test_im2col_3d(GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32));
diff --git a/tests/test-llama-archs.cpp b/tests/test-llama-archs.cpp
index 16af11a2862..1def7faff60 100644
--- a/tests/test-llama-archs.cpp
+++ b/tests/test-llama-archs.cpp
@@ -12,6 +12,7 @@
 #include "../src/llama-model-saver.h"
 
 #include <cinttypes>
+#include <cstddef>
 #include <cstdio>
 #include <cstring>
 #include <cstdint>
@@ -99,6 +100,7 @@ static gguf_context_ptr get_gguf_ctx(const llm_arch arch, const bool moe) {
         n_ff   = 96;
         n_layer = 22; // hparams.n_layer_kv_from_start = 20 is hardcoded
     } else if (arch == LLM_ARCH_DEEPSEEK2
+            || arch == LLM_ARCH_DEEPSEEK32
             || arch == LLM_ARCH_GLM_DSA
             || arch == LLM_ARCH_KIMI_LINEAR
             || arch == LLM_ARCH_MISTRAL4) {
@@ -155,6 +157,7 @@ static gguf_context_ptr get_gguf_ctx(const llm_arch arch, const bool moe) {
 
     ms.add_kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, 8.0f);
     if (arch == LLM_ARCH_DEEPSEEK2
+            || arch == LLM_ARCH_DEEPSEEK32
             || arch == LLM_ARCH_GLM_DSA
             || arch == LLM_ARCH_KIMI_LINEAR
             || arch == LLM_ARCH_MISTRAL4) {
@@ -331,6 +334,7 @@ static bool moe_mandatory(const llm_arch arch) {
         case LLM_ARCH_ARCTIC:
         case LLM_ARCH_DEEPSEEK:
         case LLM_ARCH_DEEPSEEK2:
+        case LLM_ARCH_DEEPSEEK32:
         case LLM_ARCH_GLM4_MOE:
         case LLM_ARCH_GLM_DSA:
         case LLM_ARCH_EXAONE_MOE:
@@ -497,6 +501,7 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg
     };
 
     std::vector<device_config> dev_configs;
+    size_t max_device_label_length = 4;
     {
         std::vector<ggml_backend_dev_t> devices_meta;
         {
@@ -504,6 +509,7 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg
             for (size_t i = 0; i < device_count; i++) {
                 ggml_backend_dev_t dev = ggml_backend_dev_get(i);
                 dev_configs.emplace_back(std::vector<ggml_backend_dev_t>{dev}, ggml_backend_dev_description(dev), LLAMA_SPLIT_MODE_LAYER);
+                max_device_label_length = std::max(max_device_label_length, dev_configs.back().label.length());
 
                 // cpu-based devices cannot be used in tensor split mode
                 if (ggml_backend_dev_buffer_type(dev) != ggml_backend_cpu_buffer_type()) {
@@ -515,10 +521,27 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg
         dev_configs.emplace_back(devices_meta, "Meta", LLAMA_SPLIT_MODE_TENSOR);
     }
 
+    size_t max_arch_name_length = 0;
+    for (const llm_arch & arch : llm_arch_all()) {
+        max_arch_name_length = std::max(max_arch_name_length, strlen(llm_arch_name(arch)));
+    }
+
+    const std::string template_header  = std::string("|%" + std::to_string(max_arch_name_length) + "s|%") + std::to_string(max_device_label_length) + "s|%6s|%15s|%9s|\n";
+    const std::string template_row_cfg = std::string("|%" + std::to_string(max_arch_name_length) + "s|%") + std::to_string(max_device_label_length) + "s|%6s|";
+    const std::string template_row_res = "%15s %10s|%20s|\n";
+
     bool all_ok = true;
     common_log_flush(common_log_main());
-    printf("|%16s|%30s|%6s|%15s|%9s|\n", "Model arch.", "Device", "Config", "NMSE vs. CPU", "Roundtrip");
-    printf("|----------------|------------------------------|------|---------------|---------|\n");
+    printf(template_header.c_str(), "Model arch.", "Device", "Config", "NMSE vs. CPU", "Roundtrip");
+    printf("|");
+    for (size_t i = 0; i < max_arch_name_length; i++) {
+        printf("-");
+    }
+    printf("|");
+    for (size_t i = 0; i < max_device_label_length; i++) {
+        printf("-");
+    }
+    printf("|------|---------------|---------|\n");
     for (const llm_arch & arch : llm_arch_all()) {
         if (arch == LLM_ARCH_UNKNOWN) {
             continue;
@@ -543,6 +566,11 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg
             std::pair<llama_model_ptr, llama_context_ptr> model_and_ctx_cpu;
             std::vector<float> logits_cpu;
             for (device_config & dc : dev_configs) {
+                // print test config first; should anything fail during model loading or inference, at least we know which test case caused it
+                printf(template_row_cfg.c_str(),
+                    llm_arch_name(arch), dc.label.c_str(), config_name.c_str());
+                fflush(stdout);
+
                 std::pair<llama_model_ptr, llama_context_ptr> model_and_ctx_dev;
                 std::vector<float> logits_dev;
                 std::string status_nmse      = "\033[1;33mSKIP\033[0m";
@@ -595,8 +623,9 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg
                     }
                 }
 
-                printf("|%16s|%30s|%6s|%15s %10s|%20s|\n", llm_arch_name(arch), dc.label.c_str(),
-                    config_name.c_str(), status_nmse.c_str(), nmse_str, status_roundtrip.c_str());
+                // log the results for this test case
+                printf(template_row_res.c_str(),
+                    status_nmse.c_str(), nmse_str, status_roundtrip.c_str());
             }
         }
     }
diff --git a/tools/cli/README.md b/tools/cli/README.md
index 04aef018870..b11aa45ce95 100644
--- a/tools/cli/README.md
+++ b/tools/cli/README.md
@@ -12,7 +12,6 @@
 | -------- | ----------- |
 | `-h, --help, --usage` | print usage and exit |
 | `--version` | show version and build info |
-| `--license` | show source code license and dependencies |
 | `-cl, --cache-list` | show list of models in cache |
 | `--completion-bash` | print source-able bash completion script for llama.cpp |
 | `-t, --threads N` | number of CPU threads to use during generation (default: -1)<br/>(env: LLAMA_ARG_THREADS) |
@@ -171,8 +170,8 @@
 | `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))<br/>(env: LLAMA_ARG_REASONING) |
 | `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
 | `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)<br/>(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) |
-| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-vl, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
-| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-vl, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
+| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, granite-4.1, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-vl, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
+| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, granite-4.1, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-vl, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
 | `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)<br/>(env: LLAMA_ARG_SKIP_CHAT_PARSING) |
 | `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles |
 | `--spec-draft-hf, -hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant]` | Same as --hf-repo, but for the draft model (default: unused)<br/>(env: LLAMA_ARG_SPEC_DRAFT_HF_REPO) |
diff --git a/tools/completion/README.md b/tools/completion/README.md
index e8a1287f3a1..d90f8174866 100644
--- a/tools/completion/README.md
+++ b/tools/completion/README.md
@@ -95,7 +95,6 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
 | -------- | ----------- |
 | `-h, --help, --usage` | print usage and exit |
 | `--version` | show version and build info |
-| `--license` | show source code license and dependencies |
 | `-cl, --cache-list` | show list of models in cache |
 | `--completion-bash` | print source-able bash completion script for llama.cpp |
 | `-t, --threads N` | number of CPU threads to use during generation (default: -1)<br/>(env: LLAMA_ARG_THREADS) |
@@ -254,8 +253,8 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
 | `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))<br/>(env: LLAMA_ARG_REASONING) |
 | `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
 | `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)<br/>(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) |
-| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-vl, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
-| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-vl, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
+| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, granite-4.1, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-vl, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
+| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, granite-4.1, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-vl, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
 | `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)<br/>(env: LLAMA_ARG_SKIP_CHAT_PARSING) |
 | `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles |
 
diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt
index ffd30c7e6a1..14808d4221d 100644
--- a/tools/mtmd/CMakeLists.txt
+++ b/tools/mtmd/CMakeLists.txt
@@ -40,6 +40,7 @@ add_library(mtmd
             models/siglip.cpp
             models/whisper-enc.cpp
             models/deepseekocr.cpp
+            models/deepseekocr2.cpp
             models/mobilenetv5.cpp
             models/youtuvl.cpp
             models/yasa2.cpp
diff --git a/tools/mtmd/clip-graph.h b/tools/mtmd/clip-graph.h
index c5e880c71ec..1d9f6a136a9 100644
--- a/tools/mtmd/clip-graph.h
+++ b/tools/mtmd/clip-graph.h
@@ -29,6 +29,7 @@ struct clip_graph {
     const int n_patches;
     const int n_embd;
     const int n_head;
+    const int n_head_kv;
     const int d_head;
     const int n_layer;
     const int n_mmproj_embd;
diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h
index ef4c342ba86..14398dc4869 100644
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@@ -188,6 +188,8 @@
 #define TN_SAM_FFN_DOWN   "v.sam.blk.%d.mlp.lin2.%s"
 #define TN_SAM_NECK       "v.sam.neck.%d.%s"
 #define TN_SAM_NET        "v.sam.net_%d.%s"
+// deepseek-ocr-2
+#define TN_RESMPL_QUERY  "v.resample_query_%d.%s"
 // (conformer) lfm2
 #define TN_PRE_ENCODE_OUT  "a.pre_encode.out.%s"
 #define TN_FFN_NORM        "%s.blk.%d.ffn_norm.%s"
@@ -337,6 +339,7 @@ enum projector_type {
     PROJECTOR_TYPE_JANUS_PRO,
     PROJECTOR_TYPE_DOTS_OCR,
     PROJECTOR_TYPE_DEEPSEEKOCR,
+    PROJECTOR_TYPE_DEEPSEEKOCR2,
     PROJECTOR_TYPE_LFM2A,
     PROJECTOR_TYPE_GLM4V,
     PROJECTOR_TYPE_YOUTUVL,
@@ -386,6 +389,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
     { PROJECTOR_TYPE_DOTS_OCR,  "dots_ocr"},
     { PROJECTOR_TYPE_DEEPSEEKOCR,"deepseekocr"},
+    { PROJECTOR_TYPE_DEEPSEEKOCR2,"deepseekocr2"},
     { PROJECTOR_TYPE_LFM2A,     "lfm2a"},
     { PROJECTOR_TYPE_GLM4V,     "glm4v"},
     { PROJECTOR_TYPE_YOUTUVL,   "youtuvl"},
@@ -424,6 +428,9 @@ struct clip_image_f32 {
     int ny;
 
     std::vector<float> buf;
+
+    // marks the global view (e.g., in DeepSeek-OCR models), after which a view separator is appended
+    bool add_viewsep = false;
 };
 
 //
diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h
index e0de41e0b5b..1f3657a8507 100644
--- a/tools/mtmd/clip-model.h
+++ b/tools/mtmd/clip-model.h
@@ -542,6 +542,11 @@ struct clip_model {
     int32_t n_sam_layers = 12; // used by deepseek-ocr sam encoder
 
     std::vector<clip_layer> sam_layers;
+
+    // deepseek-ocr-2
+    ggml_tensor * resample_query_768 = nullptr;
+    ggml_tensor * resample_query_1024 = nullptr;
+
     // lfm2 audio
     std::array<ggml_tensor *, 7> pre_encode_conv_X_w = {nullptr};
     std::array<ggml_tensor *, 7> pre_encode_conv_X_b = {nullptr};
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 5fd583d40bc..7bb702b95c4 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -246,6 +246,7 @@ clip_graph::clip_graph(clip_ctx * ctx, const clip_image_f32 & img) :
         n_patches(n_patches_x * n_patches_y),
         n_embd(hparams.n_embd),
         n_head(hparams.n_head),
+        n_head_kv(hparams.n_head_kv),
         d_head(n_embd / n_head),
         n_layer(hparams.n_layer),
         n_mmproj_embd(clip_n_mmproj_embd(ctx)),
@@ -401,9 +402,9 @@ ggml_tensor * clip_graph::build_vit(
                     }
                 }
 
-                Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
-                Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
-                Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
+                Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head,    n_pos);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head_kv, n_pos);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head_kv, n_pos);
 
                 if (norm_per_head) {
                     if (layer.q_norm) {
@@ -952,6 +953,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             {
                 builder = std::make_unique<clip_graph_deepseekocr>(ctx, img);
             } break;
+        case PROJECTOR_TYPE_DEEPSEEKOCR2:
+             {
+                builder = std::make_unique<clip_graph_deepseekocr2>(ctx, img);
+            } break;
         case PROJECTOR_TYPE_LFM2A:
             {
                 builder = std::make_unique<clip_graph_conformer>(ctx, img);
@@ -1120,6 +1125,9 @@ struct clip_model_loader {
             get_u32(string_format(KEY_PROJ_DIM,       prefix), hparams.projection_dim);
             get_f32(string_format(KEY_LAYER_NORM_EPS, prefix), hparams.eps);
 
+            // n_head_kv is optional (for GQA), default to n_head
+            hparams.n_head_kv = hparams.n_head;
+
             if (is_vision) {
                 get_u32(KEY_IMAGE_SIZE, hparams.image_size);
                 get_u32(KEY_PATCH_SIZE, hparams.patch_size);
@@ -1510,6 +1518,7 @@ struct clip_model_loader {
                         hparams.set_warmup_n_tokens(28*28); // avoid OOM on warmup
                     } break;
                 case PROJECTOR_TYPE_DEEPSEEKOCR:
+                case PROJECTOR_TYPE_DEEPSEEKOCR2:
                     {
                         hparams.patch_size = 16;
                         hparams.image_size = 1024;
@@ -1521,6 +1530,10 @@ struct clip_model_loader {
                         get_u32(KEY_SAM_N_HEAD, hparams.sam_n_head, true);
                         get_u32(KEY_SAM_N_EMBD, hparams.sam_n_embd, true);
                         get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
+                        if (model.proj_type == PROJECTOR_TYPE_DEEPSEEKOCR2) {
+                            // qwen2 encoder is GQA, requires KEY_N_HEAD_KV
+                            get_u32(string_format(KEY_N_HEAD_KV, "vision"), hparams.n_head_kv);
+                        }
                      } break;
                 case PROJECTOR_TYPE_HUNYUANVL:
                     {
@@ -1552,6 +1565,9 @@ struct clip_model_loader {
                         hparams.audio_n_fft            = 512;
                         hparams.audio_window_len       = 320;  // 20ms frame (NOT 25ms/400)
                         hparams.audio_hop_len          = 160;
+                        // due to a mistake in the original conversion code, rms_norm_eps is set to a wrong value
+                        // since all gemma4a models use 1e-6, we just hardcode it here to avoid re-conversion
+                        hparams.eps = 1e-6f;
                     } break;
                 case PROJECTOR_TYPE_GRANITE_SPEECH:
                     {
@@ -2367,6 +2383,7 @@ struct clip_model_loader {
                     model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
                 } break;
             case PROJECTOR_TYPE_DEEPSEEKOCR:
+            case PROJECTOR_TYPE_DEEPSEEKOCR2:
                 {
                     model.pos_embed          = get_tensor(string_format(TN_SAM_POS_EMBD,   "weight"));
                     model.patch_embed_proj_w = get_tensor(string_format(TN_SAM_PATCH_EMBD, "weight"));
@@ -2397,10 +2414,12 @@ struct clip_model_loader {
                     model.neck_3_w       = get_tensor(string_format(TN_SAM_NECK, 3, "weight"));
                     model.net_2          = get_tensor(string_format(TN_SAM_NET, 2, "weight"));
                     model.net_3          = get_tensor(string_format(TN_SAM_NET, 3, "weight"));
-                    model.image_newline  = get_tensor(TN_IMAGE_NEWLINE);
+                    model.image_newline  = get_tensor(TN_IMAGE_NEWLINE, false);
                     model.view_seperator = get_tensor(TN_IMAGE_SEPERATOR);
                     model.mm_fc_w        = get_tensor(string_format(TN_MM_PROJECTOR, "weight"));
                     model.mm_fc_b        = get_tensor(string_format(TN_MM_PROJECTOR, "bias"));
+                    model.resample_query_768  = get_tensor(string_format(TN_RESMPL_QUERY, 768, "weight"), false);
+                    model.resample_query_1024 = get_tensor(string_format(TN_RESMPL_QUERY, 1024, "weight"), false);
                  } break;
             case PROJECTOR_TYPE_GEMMA4A:
                 {
@@ -3270,7 +3289,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
         case PROJECTOR_TYPE_DEEPSEEKOCR:
         {
             // SAM encoder applies two stride-2 convolutions (net_2 and net_3)
-            // which reduces spatial dimensions by 4x in each direction (16x total)
+            // that reduce spatial dimensions by 4x in each direction (16x total)
             // E.g., 64x64 -> 16x16 patches
             n_patches /= 16;
 
@@ -3286,6 +3305,15 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
                 int oh = (img->ny / patch_size) / merge;
                 n_patches = (ow + 1) * oh + 2;
             } break;
+        case PROJECTOR_TYPE_DEEPSEEKOCR2:
+        {
+            // 1024 global view -> 256 query tokens + 1 view separator = 257;
+            // 768 local tile   -> 144 query tokens, no separator.
+            n_patches /= 16;
+            if (img->add_viewsep) {
+                n_patches += 1; // view separator, appended only after the global view
+            }
+        } break;
         case PROJECTOR_TYPE_LFM2A:
             {
                 n_patches = ((((img->nx + 1) / 2) + 1) / 2 + 1) / 2;
@@ -3875,6 +3903,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                 set_input_i32("pos_y", pos_y);
             } break;
         case PROJECTOR_TYPE_DEEPSEEKOCR:
+        case PROJECTOR_TYPE_DEEPSEEKOCR2:
             {
                 GGML_ASSERT(pos_w == pos_h);
 
@@ -3897,6 +3926,34 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
 
                 set_input_i32("rel_pos_indices_local", rel_pos_indices_local);
                 set_input_i32("rel_pos_indices_global", rel_pos_indices_global);
+
+                if (ctx->proj_type() == PROJECTOR_TYPE_DEEPSEEKOCR2) {
+
+                    // qwen2 encoder attention mask
+
+                    // num_image_tokens = num_patches / 16
+                    //   256 for 1024 global view
+                    //   144 for 768 tile views
+                    const int   num_image_tokens = num_patches / 16;
+                    const int   seq_len          = num_image_tokens * 2;
+                    std::vector qwen2_mask(static_cast<size_t>(seq_len) * seq_len, 0.0f);
+
+                    // attention mask layout
+                    //  +--------------+---------------+
+                    //  |    all 0     |   all -inf    |
+                    //  +--------------+---------------+
+                    //  |    all 0     |  lower tri 0  |
+                    //  +--------------+---------------+
+                    for (int i = 0; i < seq_len; i++) {
+                        for (int j = 0; j < seq_len; j++) {
+                            const bool zero = i < num_image_tokens ?
+                                                     j < num_image_tokens :
+                                                     j < num_image_tokens || j <= i;
+                            qwen2_mask[static_cast<size_t>(i) * seq_len + j] = zero ? 0.0f : -1e9f;
+                        }
+                    }
+                    set_input_f32("qwen2_attn_mask", qwen2_mask);
+                }
             } break;
         case PROJECTOR_TYPE_GEMMA3:
         case PROJECTOR_TYPE_GEMMA3NV:
@@ -4249,6 +4306,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
         case PROJECTOR_TYPE_COGVLM:
             return ctx->model.mm_4h_to_h_w->ne[1];
         case PROJECTOR_TYPE_DEEPSEEKOCR:
+        case PROJECTOR_TYPE_DEEPSEEKOCR2:
             return ctx->model.mm_fc_w->ne[1];
         case PROJECTOR_TYPE_LFM2A:
             return ctx->model.position_embeddings->ne[0];
diff --git a/tools/mtmd/debug/mtmd-debug.cpp b/tools/mtmd/debug/mtmd-debug.cpp
index f19ca4cfe29..b88a16f0f8b 100644
--- a/tools/mtmd/debug/mtmd-debug.cpp
+++ b/tools/mtmd/debug/mtmd-debug.cpp
@@ -30,7 +30,9 @@ static void show_additional_info(int /*argc*/, char ** argv) {
         "    -p \"encode\" (debugging encode pass, default case):\n"
         "        --image can be:\n"
         "          \"white\", \"black\", \"gray\": filled 1.0f, 0.0f and 0.5f respectively\n"
+        "          \"red\", \"green\", \"blue\": filled with respective colors\n"
         "          \"cb\": checkerboard pattern, alternate 1.0f and 0.0f\n"
+        "          \"rainbow\": raspberry-pi-like rainbow pattern\n"
         "        --audio can be:\n"
         "          \"one\", \"zero\", \"half\": filled 1.0f, 0.0f and 0.5f respectively\n"
         "          \"1010\": checkerboard pattern, alternate 1.0f and 0.0f\n"
@@ -144,6 +146,65 @@ int main(int argc, char ** argv) {
                     image[y][x * 3 + 2] = v;
                 }
             }
+        } else if (input == "red") {
+            for (int i = 0; i < inp_size; ++i) {
+                auto row = std::vector<float>(inp_size * 3, 0.0f);
+                for (int j = 0; j < inp_size; ++j) {
+                    row[j * 3 + 0] = 1.0f;  // R channel
+                }
+                image.push_back(row);
+            }
+        } else if (input == "green") {
+            for (int i = 0; i < inp_size; ++i) {
+                auto row = std::vector<float>(inp_size * 3, 0.0f);
+                for (int j = 0; j < inp_size; ++j) {
+                    row[j * 3 + 1] = 1.0f;  // G channel
+                }
+                image.push_back(row);
+            }
+        } else if (input == "blue") {
+            for (int i = 0; i < inp_size; ++i) {
+                auto row = std::vector<float>(inp_size * 3, 0.0f);
+                for (int j = 0; j < inp_size; ++j) {
+                    row[j * 3 + 2] = 1.0f;  // B channel
+                }
+                image.push_back(row);
+            }
+        } else if (input == "rainbow") {
+            for (int i = 0; i < inp_size; ++i) {
+                image.push_back(std::vector<float>(inp_size * 3, 0.0f));
+            }
+            float cx = inp_size / 2.0f;
+            float cy = inp_size / 2.0f;
+            float max_dist = std::sqrt(cx * cx + cy * cy);
+            for (int y = 0; y < inp_size; ++y) {
+                for (int x = 0; x < inp_size; ++x) {
+                    float dx = x - cx;
+                    float dy = y - cy;
+                    float hue = std::atan2(dy, dx) / (2.0f * 3.14159265f);
+                    if (hue < 0) hue += 1.0f;
+                    float sat = std::sqrt(dx * dx + dy * dy) / max_dist;
+                    if (sat > 1.0f) sat = 1.0f;
+                    float h6 = hue * 6.0f;
+                    int i6 = (int)h6;
+                    float f = h6 - i6;
+                    float p = 1.0f - sat;
+                    float q = 1.0f - sat * f;
+                    float t = 1.0f - sat * (1.0f - f);
+                    float r, g, b;
+                    switch (i6 % 6) {
+                        case 0: r=1; g=t; b=p; break;
+                        case 1: r=q; g=1; b=p; break;
+                        case 2: r=p; g=1; b=t; break;
+                        case 3: r=p; g=q; b=1; break;
+                        case 4: r=t; g=p; b=1; break;
+                        default: r=1; g=p; b=q; break;
+                    }
+                    image[y][x * 3 + 0] = r;
+                    image[y][x * 3 + 1] = g;
+                    image[y][x * 3 + 2] = b;
+                }
+            }
         } else if (input == "one") {
             samples = std::vector<float>(inp_size, 1.0f);
         } else if (input == "zero") {
diff --git a/tools/mtmd/debug/mtmd-debug.md b/tools/mtmd/debug/mtmd-debug.md
index 76ffe5c8451..71bd52dd4b3 100644
--- a/tools/mtmd/debug/mtmd-debug.md
+++ b/tools/mtmd/debug/mtmd-debug.md
@@ -20,6 +20,43 @@ def test_vision():
 test_vision()
 ```
 
+Example of debugging a rainbow image:
+
+```py
+import torch
+import math
+
+def make_rainbow(img_size):
+    cx, cy = img_size / 2.0, img_size / 2.0
+    max_dist = math.sqrt(cx * cx + cy * cy)
+    img = torch.zeros(1, 3, img_size, img_size)
+    for y in range(img_size):
+        for x in range(img_size):
+            dx, dy = x - cx, y - cy
+            hue = math.atan2(dy, dx) / (2 * math.pi)
+            if hue < 0:
+                hue += 1
+            sat = math.sqrt(dx * dx + dy * dy) / max_dist
+            sat = min(sat, 1.0)
+            h6 = hue * 6
+            i6 = int(h6)
+            f = h6 - i6
+            p = 1 - sat
+            q = 1 - sat * f
+            t = 1 - sat * (1 - f)
+            rgb = [(1,t,p),(q,1,p),(p,1,t),(p,q,1),(t,p,1),(1,p,q)][i6 % 6]
+            img[0, 0, y, x] = rgb[0]
+            img[0, 1, y, x] = rgb[1]
+            img[0, 2, y, x] = rgb[2]
+    return img
+
+img_size = 896
+pixel_values = make_rainbow(img_size)
+with torch.no_grad():
+    outputs = model.model.get_image_features(pixel_values=pixel_values)
+print("last_hidden_state:", outputs.last_hidden_state)
+```
+
 ## Debugging preprocess pass
 
 (TODO)
diff --git a/tools/mtmd/models/deepseekocr.cpp b/tools/mtmd/models/deepseekocr.cpp
index 8419d496a5b..c3c22d0a4ba 100644
--- a/tools/mtmd/models/deepseekocr.cpp
+++ b/tools/mtmd/models/deepseekocr.cpp
@@ -157,7 +157,6 @@ ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) {
 
             cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
             cur = ggml_add(ctx0, cur, layer.qkv_b);
-            cur = ggml_cont(ctx0, cur); // Ensure tensor is contiguous before reshape
             cur = ggml_reshape_4d(ctx0, cur, n_embd, 3, W * H, B);
 
             ggml_tensor * Q;
@@ -251,17 +250,17 @@ ggml_cgraph * clip_graph_deepseekocr::build() {
     ggml_tensor * inp_raw = build_inp_raw();
     ggml_tensor * sam_out = build_sam(inp_raw);
 
+    const int clip_n_patches = sam_out->ne[0] * sam_out->ne[1];
+
     ggml_tensor * clip_out;
     // Building DS-OCR CLIP
     {
         ggml_tensor * inp;
 
-        inp = ggml_cpy(ctx0, sam_out, ggml_dup_tensor(ctx0, sam_out));
-        inp = ggml_reshape_2d(ctx0, inp, inp->ne[0] * inp->ne[1], inp->ne[2]);
+        inp = ggml_reshape_2d(ctx0, sam_out, clip_n_patches, sam_out->ne[2]);
         inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
 
-        ggml_tensor * new_pos_embd =
-            ggml_cpy(ctx0, model.position_embeddings, ggml_dup_tensor(ctx0, model.position_embeddings));
+        ggml_tensor * new_pos_embd = model.position_embeddings;
 
         int        n_pos    = new_pos_embd->ne[1];  // +1 for [CLS]
         const auto tgt_size = static_cast<int>(std::sqrt(inp->ne[1]));
@@ -295,16 +294,12 @@ ggml_cgraph * clip_graph_deepseekocr::build() {
         clip_out = cur;
     }
 
-    const int clip_n_patches = sam_out->ne[0] * sam_out->ne[1];
-
     sam_out  = ggml_cont(ctx0, ggml_permute(ctx0, sam_out, 1, 2, 0, 3));
     sam_out  = ggml_reshape_2d(ctx0, sam_out, sam_out->ne[0], clip_n_patches);
     clip_out = ggml_view_2d(ctx0, clip_out, n_embd, clip_n_patches, clip_out->nb[1], clip_out->nb[1]);
 
     ggml_tensor * cur;
     cur = ggml_concat(ctx0, clip_out, sam_out, 0);
-    cur = ggml_reshape_2d(ctx0, cur, 2 * n_embd, clip_n_patches);
-    cur = ggml_cont(ctx0, cur);
     cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur);
     cur = ggml_add(ctx0, cur, model.mm_fc_b);
 
@@ -313,13 +308,11 @@ ggml_cgraph * clip_graph_deepseekocr::build() {
     const auto n_dim = cur->ne[0];
 
     ggml_tensor * imgnl;
-    ggml_tensor * vs;
 
     imgnl = ggml_repeat_4d(ctx0, model.image_newline, n_dim, 1, h, 1);
-    vs    = ggml_reshape_2d(ctx0, model.view_seperator, n_dim, 1);  // (n_dim, 1)
     cur   = ggml_reshape_3d(ctx0, cur, n_dim, w, h);
     cur   = ggml_reshape_2d(ctx0, ggml_concat(ctx0, cur, imgnl, 1), n_dim, (w + 1) * h);
-    cur   = ggml_concat(ctx0, cur, vs, 1);  // (n_dim, h*(w+1) + 1)
+    cur   = ggml_concat(ctx0, cur, model.view_seperator, 1);  // (n_dim, h*(w+1) + 1)
 
     cb(cur, "dsocr_output", -1);
 
diff --git a/tools/mtmd/models/deepseekocr2.cpp b/tools/mtmd/models/deepseekocr2.cpp
new file mode 100644
index 00000000000..056bb81807f
--- /dev/null
+++ b/tools/mtmd/models/deepseekocr2.cpp
@@ -0,0 +1,81 @@
+#include "models.h"
+
+ggml_cgraph * clip_graph_deepseekocr2::build() { // SAM -> Qwen2 encoder with appended query tokens -> mm_fc projection
+    GGML_ASSERT(hparams.n_head_kv > 0);
+    GGML_ASSERT(n_head % hparams.n_head_kv == 0);
+
+    // patch embedding
+    ggml_tensor * inp_raw = build_inp_raw();
+
+    ggml_tensor * sam_out = build_sam(inp_raw);
+
+    ggml_tensor * qwen2_out;
+    // Building Qwen2 encoder
+    {
+        ggml_tensor * inp;
+
+        inp = ggml_reshape_2d(ctx0, sam_out, sam_out->ne[0] * sam_out->ne[1], sam_out->ne[2]); // H*W, C
+        inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
+
+        auto num_image_tokens = inp->ne[1]; // H*W
+        GGML_ASSERT(num_image_tokens == 144 || num_image_tokens == 256);
+
+        // query based on numbers of image tokens (in SAM output)
+        // 16x16 -> query_1024 (1024x1024 images)
+        // 12x12 -> query_768 (768x768 images)
+
+        ggml_tensor * query_embed = model.resample_query_1024;
+        int           num_queries = 256; // default pairs with resample_query_1024; overridden below for 144 tokens
+
+        if (num_image_tokens == 144) {
+            query_embed = model.resample_query_768;
+            num_queries = 144;
+        }
+
+        // (B, num_image_tokens + num_queries, C)
+        inp = ggml_concat(ctx0, inp, ggml_cast(ctx0, query_embed, inp->type), 1);
+
+        auto seq_len = inp->ne[1];
+
+        // qwen2 encoder attention mask
+        ggml_tensor * attn_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, seq_len, seq_len);
+        ggml_set_name(attn_mask, "qwen2_attn_mask");
+        ggml_set_input(attn_mask); // graph input: its values are not written anywhere in this function
+
+        ggml_tensor * inp_pos = ggml_cast(ctx0, ggml_arange(ctx0, 0, seq_len, 1), GGML_TYPE_I32); // positions 0..seq_len-1
+
+        auto add_rope = [&](ggml_tensor * x, const clip_layer &) {
+            return ggml_rope_ext(ctx0, x, inp_pos, nullptr, d_head,
+                                 GGML_ROPE_TYPE_NEOX, 131072, 1000000, 1, 0, 1, 0, 0);
+        };
+
+        build_vit_opts vit_opts;
+        vit_opts.attn_mask = attn_mask;
+
+        // build_vit applies model.post_ln_w internally; do not re-apply
+        ggml_tensor * cur = build_vit(inp, seq_len, NORM_TYPE_RMS, FFN_SILU,
+                                      /* learned_pos_embd */ nullptr, add_rope, vit_opts);
+
+        cur = ggml_cont(ctx0,
+                        ggml_view_2d(ctx0, cur, cur->ne[0], num_queries, cur->nb[1],
+                                     cur->nb[1] * (cur->ne[1] - num_queries))); // only take query tokens for output
+
+        ggml_build_forward_expand(gf, cur);
+        qwen2_out = cur;
+    }
+
+    ggml_tensor * cur;
+
+    cur = ggml_mul_mat(ctx0, model.mm_fc_w, qwen2_out);
+    cur = ggml_add(ctx0, cur, model.mm_fc_b);
+
+    // view_seperator only after the global view
+    if (img.add_viewsep) {
+        cur = ggml_concat(ctx0, cur, model.view_seperator, 1); // (n_dim, 257)
+    }
+
+    cb(cur, "dsocr2_output", -1);
+
+    ggml_build_forward_expand(gf, cur);
+    return gf;
+}
diff --git a/tools/mtmd/models/gemma4v.cpp b/tools/mtmd/models/gemma4v.cpp
index 4068a08aaf9..3570d6da135 100644
--- a/tools/mtmd/models/gemma4v.cpp
+++ b/tools/mtmd/models/gemma4v.cpp
@@ -124,12 +124,12 @@ ggml_cgraph * clip_graph_gemma4v::build() {
     }
 
     // Gemma4MultimodalEmbedder
-    cur = build_mm(model.mm_input_proj_w, cur);
-    cb(cur, "projected", -1);
-
-    // embedding_post_projection_norm
-    cur = ggml_rms_norm(ctx0, cur, hparams.eps);
-    cb(cur, "projected_normed", -1);
+    {
+        // embedding_pre_projection_norm
+        cur = ggml_rms_norm(ctx0, cur, hparams.eps);
+        cur = build_mm(model.mm_input_proj_w, cur);
+        cb(cur, "projected", -1);
+    }
 
     ggml_build_forward_expand(gf, cur);
     return gf;
diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h
index 119c2d541b5..a856882c275 100644
--- a/tools/mtmd/models/models.h
+++ b/tools/mtmd/models/models.h
@@ -121,6 +121,11 @@ struct clip_graph_deepseekocr : clip_graph {
     ggml_tensor * build_sam(ggml_tensor * inp); // build the SAM model
 };
 
+struct clip_graph_deepseekocr2 : clip_graph_deepseekocr {
+    clip_graph_deepseekocr2(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph_deepseekocr(ctx, img) {}
+    ggml_cgraph * build() override; // reuses build_sam() from base
+};
+
 struct clip_graph_conformer : clip_graph {
     clip_graph_conformer(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
     ggml_cgraph * build() override;
diff --git a/tools/mtmd/mtmd-image.cpp b/tools/mtmd/mtmd-image.cpp
index 37c271d18a8..caf72d53621 100644
--- a/tools/mtmd/mtmd-image.cpp
+++ b/tools/mtmd/mtmd-image.cpp
@@ -1137,6 +1137,105 @@ bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img,
     return true;
 }
 
+//
+// mtmd_image_preprocessor_deepseekocr2
+//
+
+// candidate tile grids (cols, rows) with min_tiles <= cols*rows <= max_tiles, returned sorted by tile count;
+// equal-count grids keep their discovery order, so downstream tie-breaks do not depend on the sort impl
+std::vector<clip_image_size> mtmd_image_preprocessor_deepseekocr2::get_target_ratios() {
+    std::vector<clip_image_size> ratios;
+    for (int n = min_tiles; n <= max_tiles; n++) {
+        for (int w = 1; w <= n; w++) {
+            for (int h = 1; h <= n; h++) {
+                if (w * h < min_tiles || w * h > max_tiles) {
+                    continue;
+                }
+                bool found = false; // the same (w, h) is reached for several n; keep the first occurrence only
+                for (const auto & r : ratios) {
+                    if (r.width == w && r.height == h) {
+                        found = true;
+                        break;
+                    }
+                }
+                if (!found) {
+                    ratios.push_back({ w, h });
+                }
+            }
+        }
+    }
+    std::stable_sort(ratios.begin(), ratios.end(), [](const clip_image_size & a, const clip_image_size & b) {
+        return a.width * a.height < b.width * b.height; // compare by tile count only
+    });
+    return ratios;
+}
+
+// pick the grid whose cols/rows ratio is closest to aspect_ratio; returns {1, 1} when target_ratios is empty
+// on an exact tie, a later candidate replaces the pick only if width*height exceeds half its tiled pixel area
+clip_image_size mtmd_image_preprocessor_deepseekocr2::find_closest_aspect_ratio(
+    float                                aspect_ratio,
+    const std::vector<clip_image_size> & target_ratios,
+    int                                  width,
+    int                                  height) {
+    float           best_ratio_diff = std::numeric_limits<float>::max();
+    clip_image_size best_ratio      = { 1, 1 };
+    const float     area            = static_cast<float>(width) * static_cast<float>(height); // float product: int width*height can overflow
+
+    for (const auto & ratio : target_ratios) {
+        const float target_aspect_ratio = static_cast<float>(ratio.width) / ratio.height;
+        const float ratio_diff          = std::abs(aspect_ratio - target_aspect_ratio);
+        if (ratio_diff < best_ratio_diff) {
+            best_ratio_diff = ratio_diff;
+            best_ratio      = ratio;
+        } else if (ratio_diff == best_ratio_diff) {
+            const float target_area = static_cast<float>(tile_size * tile_size * ratio.width * ratio.height);
+            if (area > 0.5f * target_area) {
+                best_ratio = ratio;
+            }
+        }
+    }
+    return best_ratio;
+}
+
+bool mtmd_image_preprocessor_deepseekocr2::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
+    // appends tile_size x tile_size local tiles to output.entries when the image exceeds a tile in either
+    // dimension, then always one base_size x base_size global view. order: [tiles..., global]. returns true.
+
+    if (img.nx > tile_size || img.ny > tile_size) {
+        const float           aspect_ratio  = static_cast<float>(img.nx) / img.ny;
+        const auto            target_ratios = get_target_ratios();
+        const clip_image_size grid          = find_closest_aspect_ratio(aspect_ratio, target_ratios, img.nx, img.ny);
+
+        // stretch onto the grid (no aspect preserve), then crop tiles row-major.
+        clip_image_u8 refined;
+        img_tool::resize(img, refined, { tile_size * grid.width, tile_size * grid.height },
+                         RESIZE_ALGO_BICUBIC_PILLOW, PAD_NONE);
+
+        for (int row = 0; row < grid.height; row++) {
+            for (int col = 0; col < grid.width; col++) {
+                clip_image_u8 tile;
+                img_tool::crop(refined, tile, col * tile_size, row * tile_size, tile_size, tile_size);
+                clip_image_f32_ptr res(clip_image_f32_init());
+                img_u8_to_f32(tile, *res, hparams.image_mean, hparams.image_std);
+                output.entries.push_back(std::move(res));
+            }
+        }
+    }
+
+    // global view: aspect-preserving fit-and-pad to base_size.
+    clip_image_u8 padded;
+    img_tool::resize(img, padded, { base_size, base_size }, RESIZE_ALGO_BICUBIC_PILLOW,
+                     PAD_NEAREST, hparams.image_pad_color);
+    clip_image_f32_ptr global(clip_image_f32_init());
+    img_u8_to_f32(padded, *global, hparams.image_mean, hparams.image_std);
+    global->add_viewsep = true; // only the global view is flagged; the dsocr2 graph appends view_seperator when set
+    output.entries.push_back(std::move(global));
+
+    output.grid_x = 1;
+    output.grid_y = 1;
+    return true;
+}
+
 //
 // mtmd_image_preprocessor_step3vl
 //
diff --git a/tools/mtmd/mtmd-image.h b/tools/mtmd/mtmd-image.h
index 08129a08ed5..91a5bc253ef 100644
--- a/tools/mtmd/mtmd-image.h
+++ b/tools/mtmd/mtmd-image.h
@@ -144,6 +144,26 @@ struct mtmd_image_preprocessor_deepseekocr : mtmd_image_preprocessor {
     bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
 };
 
+// DeepSeek-OCR-2: a 1024x1024 global view, plus InternVL-style 768x768 local
+// tiles when the image is larger than a tile in either dimension.
+struct mtmd_image_preprocessor_deepseekocr2 : mtmd_image_preprocessor {
+    static constexpr int base_size = 1024; // global view
+    static constexpr int tile_size = 768;  // local tile
+    static constexpr int min_tiles = 2;    // lower bound on cols*rows of a candidate tile grid
+    static constexpr int max_tiles = 6;    // upper bound on cols*rows of a candidate tile grid
+
+    mtmd_image_preprocessor_deepseekocr2(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
+    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
+
+private:
+    static std::vector<clip_image_size> get_target_ratios(); // all (cols, rows) grids within [min_tiles, max_tiles]
+    static clip_image_size              find_closest_aspect_ratio( // grid closest to the image's aspect ratio
+        float                                aspect_ratio,
+        const std::vector<clip_image_size> & target_ratios,
+        int                                  width,
+        int                                  height);
+};
+
 // custom image preprocessing for Step3VL
 // ref: https://huggingface.co/stepfun-ai/Step3-VL-10B/blob/main/processing_step3.py
 struct mtmd_image_preprocessor_step3vl : mtmd_image_preprocessor_llava_uhd {
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index 63b7e4d052a..b3401634fd6 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -493,6 +493,11 @@ struct mtmd_context {
                     img_end = "\n"; // prevent empty batch on llama-server
                     image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr>(ctx_v);
                 } break;
+            case PROJECTOR_TYPE_DEEPSEEKOCR2:
+                {
+                    img_end = "\n"; // prevent empty batch on llama-server
+                    image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr2>(ctx_v);
+                } break;
             case PROJECTOR_TYPE_HUNYUANVL:
                 {
                     // note: these use fullwidth ｜ (U+FF5C) and ▁ (U+2581) to match the tokenizer vocabulary
@@ -1091,16 +1096,21 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
     if (clip_is_llava(ctx_clip)
         || proj_type == PROJECTOR_TYPE_MINICPMV
         || proj_type == PROJECTOR_TYPE_GLM_EDGE
-        || proj_type == PROJECTOR_TYPE_INTERNVL) {
+        || proj_type == PROJECTOR_TYPE_INTERNVL
+        || proj_type == PROJECTOR_TYPE_DEEPSEEKOCR2) {
         // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
         const auto & entries = image_tokens->batch_f32.entries;
+        // entries may have different token counts
+        // e.g., DeepSeek-OCR-2: 144 tokens per tile view, 257 for the global view
+        size_t offset = 0;
         for (size_t i = 0; i < entries.size(); i++) {
             int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get());
             ok = clip_image_encode(
                 ctx_clip,
                 ctx->n_threads,
                 entries[i].get(),
-                ctx->image_embd_v.data() + i*n_mmproj_embd*n_tokens_per_image);
+                ctx->image_embd_v.data() + offset);
+            offset += static_cast<size_t>(n_mmproj_embd) * n_tokens_per_image;
         }
     } else {
         ok = clip_image_batch_encode(
diff --git a/tools/mtmd/tests/test-deepseek-ocr.py b/tools/mtmd/tests/test-deepseek-ocr.py
index 5c1980271b8..5f5fef765a6 100644
--- a/tools/mtmd/tests/test-deepseek-ocr.py
+++ b/tools/mtmd/tests/test-deepseek-ocr.py
@@ -3,7 +3,7 @@
 Evaluates llama.cpp's DeepSeek-OCR by comparing its output for a test
 image to the actual text in part of that image.
 
-Runs the test image through mtmd-cli, calculates CER and chrF for
+Runs each test image through mtmd-cli, calculates CER and chrF for
 its output, and holds them against the HF model's scores.
 """
 
@@ -12,24 +12,81 @@
 import subprocess
 import sys
 import unicodedata
+from dataclasses import dataclass
 from pathlib import Path
 
 logger = logging.getLogger("deepseek-ocr-test")
 
-DEFAULT_IMAGE = "test-1.jpeg"
-DEFAULT_EXPECTED_TEXT = "test-1-ground-truth.txt"
 RUN_TIMEOUT = 300
 
-# DeepSeek-OCR reference scores on the test image.
-# This is the baseline the implementation should keep up with.
-HF_REFERENCE_CER = 0.3030
-HF_REFERENCE_CHRF = 67.52
 
-CER_TOLERANCE = 0.02
-CHRF_TOLERANCE = 2.0
-
-CER_MAX = HF_REFERENCE_CER + CER_TOLERANCE
-CHRF_MIN = HF_REFERENCE_CHRF - CHRF_TOLERANCE
+@dataclass
+class ModelSpec:  # one model variant under test and the CLI flags that locate its files
+    key: str  # identifier used as the MODELS dict key and referenced by TestCase.model_key
+    label: str  # human-readable name used in log titles and argparse help text
+    model_arg: str  # CLI flag that takes the GGUF model path
+    mmproj_arg: str  # CLI flag that takes the mmproj GGUF path
+    model_default: str  # default value for model_arg
+    mmproj_default: str  # default value for mmproj_arg
+
+
+@dataclass
+class TestCase:  # one image/ground-truth pair evaluated against a model's reference scores
+    model_key: str  # key into MODELS selecting which model runs this case
+    label: str  # short description appended to the model label in log titles
+    image: str  # image path, resolved against the repo root in main()
+    ground_truth: str  # expected-text file path, resolved against the repo root in main()
+    hf_cer: float  # reference CER the llama.cpp output is compared against
+    hf_chrf: float  # reference chrF the llama.cpp output is compared against
+    cer_tol: float  # allowed CER excess over hf_cer
+    chrf_tol: float  # allowed chrF shortfall below hf_chrf
+
+    @property
+    def cer_max(self) -> float:  # highest CER that still passes
+        return self.hf_cer + self.cer_tol
+
+    @property
+    def chrf_min(self) -> float:  # lowest chrF that still passes
+        return self.hf_chrf - self.chrf_tol
+
+
+MODELS = {
+    "v1": ModelSpec(
+        key="v1", label="DeepSeek-OCR",
+        model_arg="--llama-model", mmproj_arg="--mmproj",
+        model_default="gguf_models/deepseek-ai/deepseek-ocr-bf16.gguf",
+        mmproj_default="gguf_models/deepseek-ai/mmproj-deepseek-ocr-bf16.gguf",
+    ),
+    "v2": ModelSpec(
+        key="v2", label="DeepSeek-OCR-2",
+        model_arg="--llama-model-2", mmproj_arg="--mmproj-2",
+        model_default="gguf_models/deepseek-ai/deepseek-ocr-2-bf16.gguf",
+        mmproj_default="gguf_models/deepseek-ai/mmproj-deepseek-ocr-2-bf16.gguf",
+    ),
+}
+
+CASES = [
+    TestCase(
+        model_key="v1", label="single-view scan",
+        image="tools/mtmd/test-1.jpeg",
+        ground_truth="tools/mtmd/tests/test-1-ground-truth.txt",
+        hf_cer=0.3030, hf_chrf=67.52, cer_tol=0.02, chrf_tol=2.0,
+    ),
+    TestCase(
+        model_key="v2", label="single-view scan",
+        image="tools/mtmd/test-1.jpeg",
+        ground_truth="tools/mtmd/tests/test-1-ground-truth.txt",
+        # 640x488 is below the 768 tiling threshold -- single 1024 global view.
+        # hf_cer/hf_chrf are the deepseek-ai repo's own scores (ImageOps.pad);
+        # the transformers HF processor is *not* the reference -- its pad_to_square
+        # is one pixel off and lands at ~0.69 instead.
+        hf_cer=0.7761, hf_chrf=28.70, cer_tol=0.12, chrf_tol=8.0,
+    ),
+]
+
+
+def arg_dest(flag: str) -> str:  # attribute name used with getattr(args, ...) for a CLI flag
+    return flag.lstrip("-").replace("-", "_")  # e.g. "--llama-model-2" -> "llama_model_2"
 
 
 def verdict(ok: bool) -> str:
@@ -84,6 +141,14 @@ def run_mtmd_cli(model_path, mmproj_path, image_path, bin_path) -> str:
         "--temp", "0",
         "--flash-attn", "off",  # match the HF "eager" attention reference
         "--no-warmup",
+        "-n", "512",  # cap loops on hard images (KV would otherwise fill)
+        # HF decodes with no_repeat_ngram_size; llama.cpp's analog is DRY.
+        # Default DRY breakers include "\n", so they are cleared below.
+        "--dry-multiplier", "0.8",
+        "--dry-base", "1.75",
+        "--dry-allowed-length", "2",
+        "--dry-penalty-last-n", "-1",
+        "--dry-sequence-breaker", "none",
     ]
     logger.debug(f"  command: {' '.join(cmd)}")
 
@@ -110,7 +175,7 @@ def read_expected_text(file_path: Path) -> str:
         return f.read().strip()
 
 
-def evaluate(expected: str, ocr_out: str) -> bool:
+def evaluate(case: "TestCase", expected: str, ocr_out: str) -> bool:
     expected = normalize_text(expected)
     ocr_out = normalize_text(ocr_out)
     aligned = locally_align(expected, ocr_out)
@@ -122,16 +187,16 @@ def evaluate(expected: str, ocr_out: str) -> bool:
     cer = compute_cer(expected, aligned)
     chrf = compute_chrf(expected, aligned)
 
-    cer_pass = cer <= CER_MAX
-    chrf_pass = chrf >= CHRF_MIN
+    cer_pass = cer <= case.cer_max
+    chrf_pass = chrf >= case.chrf_min
     passed = cer_pass and chrf_pass
 
     logger.info("")
     logger.info("=" * 60)
     logger.info("Free OCR evaluation:")
     logger.info("=" * 60)
-    logger.info(f"  CER               {cer:>7.4f}    (<= {CER_MAX:>7.4f}  -> {verdict(cer_pass)})")
-    logger.info(f"  chrF (0-100)      {chrf:>7.2f}    (>= {CHRF_MIN:>7.2f}  -> {verdict(chrf_pass)})")
+    logger.info(f"  CER               {cer:>7.4f}    (HF {case.hf_cer:.4f}, <= {case.cer_max:>7.4f}  -> {verdict(cer_pass)})")
+    logger.info(f"  chrF (0-100)      {chrf:>7.2f}    (HF {case.hf_chrf:.2f}, >= {case.chrf_min:>7.2f}  -> {verdict(chrf_pass)})")
     logger.info(f"  Expected chars    {len(expected):>7}")
     logger.info(f"  Aligned chars     {len(aligned):>7} (of {len(ocr_out)} OCR chars)")
     logger.info("")
@@ -142,12 +207,13 @@ def evaluate(expected: str, ocr_out: str) -> bool:
 
 def argument_parser() -> argparse.ArgumentParser:
     ap = argparse.ArgumentParser(description="Compare llama.cpp DeepSeek-OCR output with a ground-truth transcript")
-    ap.add_argument("--llama-model", default="gguf_models/deepseek-ai/deepseek-ocr-bf16.gguf",
-                    help="Path to llama.cpp GGUF model (relative to repo root or absolute)")
-    ap.add_argument("--mmproj", default="gguf_models/deepseek-ai/mmproj-deepseek-ocr-bf16.gguf",
-                    help="Path to mmproj GGUF file (relative to repo root or absolute)")
     ap.add_argument("--llama-bin", default="build/bin/llama-mtmd-cli",
                     help="Path to llama-mtmd-cli binary (relative to repo root or absolute)")
+    for spec in MODELS.values():
+        ap.add_argument(spec.model_arg, default=spec.model_default,
+                        help=f"Path to the {spec.label} GGUF model (relative to repo root or absolute)")
+        ap.add_argument(spec.mmproj_arg, default=spec.mmproj_default,
+                        help=f"Path to the {spec.label} mmproj GGUF file (relative to repo root or absolute)")
     ap.add_argument("--verbose", action="store_true",
                     help="Also log the expected, OCR, and aligned text")
     return ap
@@ -167,53 +233,60 @@ def main() -> int:
     args = argument_parser().parse_args()
     configure_logging(args.verbose)
 
-    tests_dir = Path(__file__).parent  # tools/mtmd/tests
-    mtmd_dir = tests_dir.parent  # tools/mtmd
-    repo_root = mtmd_dir.parent.parent  # repo root
+    repo_root = Path(__file__).resolve().parents[3]  # tests -> mtmd -> tools -> repo root
+    binary = resolve_path(args.llama_bin, repo_root)
 
-    inputs = [
-        ("image", resolve_path(DEFAULT_IMAGE, mtmd_dir)),
-        ("expected-text", resolve_path(DEFAULT_EXPECTED_TEXT, tests_dir)),
-        ("model", resolve_path(args.llama_model, repo_root)),
-        ("mmproj", resolve_path(args.mmproj, repo_root)),
-        ("binary", resolve_path(args.llama_bin, repo_root)),
-    ]
-    for label, path in inputs:
-        if not path.exists():
-            logger.error(f"Error: {label} not found: {path}")
-            return 1
-    paths = dict(inputs)
+    if not binary.exists():
+        logger.error(f"Error: binary not found: {binary}")
+        return 1
 
     logger.info("=" * 60)
-    logger.info("DeepSeek-OCR: llama.cpp vs ground-truth comparison")
+    logger.info("DeepSeek-OCR: llama.cpp vs HF parity check")
     logger.info("=" * 60)
-    logger.info(f"HF baselines: CER {HF_REFERENCE_CER:.4f}, chrF {HF_REFERENCE_CHRF:.2f}")
-    logger.info(f"Test thresholds: CER <= {CER_MAX:.4f}, chrF >= {CHRF_MIN:.2f}")
 
-    logger.debug("")
-    logger.debug("Resolved test inputs:")
-    for label, path in inputs:
-        logger.debug(f"  {label:<14} {path}")
-
-    logger.info("")
-    logger.info("[1/3] Running llama.cpp 'Free OCR'")
-    try:
-        ocr_out = run_mtmd_cli(paths["model"], paths["mmproj"],
-                               paths["image"], paths["binary"])
-    except RuntimeError as e:
-        logger.error(f"Error: {e}")
-        return 1
-
-    logger.info("")
-    logger.info("[2/3] Reading expected output")
-    expected = read_expected_text(paths["expected-text"])
-    logger.info(f"  expected: {len(expected)} chars")
+    results = {}
+    for case in CASES:
+        model_spec = MODELS[case.model_key]
+        title = f"{model_spec.label} -- {case.label}"
+
+        logger.info("")
+        logger.info(f"=== {title} ===")
+
+        model = resolve_path(getattr(args, arg_dest(model_spec.model_arg)), repo_root)
+        mmproj = resolve_path(getattr(args, arg_dest(model_spec.mmproj_arg)), repo_root)
+        image = resolve_path(case.image, repo_root)
+        ground_truth = resolve_path(case.ground_truth, repo_root)
+
+        missing = [(lbl, p) for lbl, p in [("model", model), ("mmproj", mmproj),
+                                           ("image", image), ("ground-truth", ground_truth)]
+                   if not p.exists()]
+        if missing:
+            for lbl, p in missing:
+                logger.error(f"  Error: {lbl} not found: {p}")
+            results[title] = False
+            continue
+
+        expected = read_expected_text(ground_truth)
+        logger.info(f"  Image: {case.image}")
+        logger.info(f"  Expected text: {len(expected)} chars")
+        logger.info("  Running llama.cpp 'Free OCR'")
+        try:
+            ocr_out = run_mtmd_cli(model, mmproj, image, binary)
+        except RuntimeError as e:
+            logger.error(f"  Error: {e}")
+            results[title] = False
+            continue
+
+        results[title] = evaluate(case, expected, ocr_out)
 
     logger.info("")
-    logger.info("[3/3] Computing OCR metrics")
-    ok = evaluate(expected, ocr_out)
+    logger.info("=== Summary ===")
+    for title, ok in results.items():
+        logger.info(f"  {title:<48} {verdict(ok)}")
+    all_passed = all(results.values())
+    logger.info(f"Overall: {verdict(all_passed)}")
 
-    return 0 if ok else 1
+    return 0 if all_passed else 1
 
 
 if __name__ == "__main__":
diff --git a/tools/server/README.md b/tools/server/README.md
index 0d20ced879f..df30ca64649 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -33,7 +33,6 @@ For the full list of features, please refer to [server's changelog](https://gith
 | -------- | ----------- |
 | `-h, --help, --usage` | print usage and exit |
 | `--version` | show version and build info |
-| `--license` | show source code license and dependencies |
 | `-cl, --cache-list` | show list of models in cache |
 | `--completion-bash` | print source-able bash completion script for llama.cpp |
 | `-t, --threads N` | number of CPU threads to use during generation (default: -1)<br/>(env: LLAMA_ARG_THREADS) |
@@ -176,6 +175,8 @@ For the full list of features, please refer to [server's changelog](https://gith
 | `-np, --parallel N` | number of server slots (default: -1, -1 = auto)<br/>(env: LLAMA_ARG_N_PARALLEL) |
 | `-cb, --cont-batching, -nocb, --no-cont-batching` | whether to enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
 | `-mm, --mmproj FILE` | path to a multimodal projector file. see tools/mtmd/README.md<br/>note: if -hf is used, this argument can be omitted<br/>(env: LLAMA_ARG_MMPROJ) |
+| `-tk, --talker-model FILE` | path to the qwen3-omni talker gguf, enables the /v1/audio/speech endpoint<br/>(env: LLAMA_ARG_TALKER_MODEL) |
+| `-c2w, --code2wav-model FILE` | path to the qwen3-omni code2wav gguf, the talker code detokenizer<br/>(env: LLAMA_ARG_CODE2WAV_MODEL) |
 | `-mmu, --mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md<br/>(env: LLAMA_ARG_MMPROJ_URL) |
 | `--mmproj-auto, --no-mmproj, --no-mmproj-auto` | whether to use multimodal projector file (if available), useful when using -hf (default: enabled)<br/>(env: LLAMA_ARG_MMPROJ_AUTO) |
 | `--mmproj-offload, --no-mmproj-offload` | whether to enable GPU offloading for multimodal projector (default: enabled)<br/>(env: LLAMA_ARG_MMPROJ_OFFLOAD) |
@@ -201,11 +202,11 @@ For the full list of features, please refer to [server's changelog](https://gith
 | `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
 | `--rerank, --reranking` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) |
 | `--api-key KEY` | API key to use for authentication, multiple keys can be provided as a comma-separated list (default: none)<br/>(env: LLAMA_API_KEY) |
-| `--api-key-file FNAME` | path to file containing API keys (default: none) |
+| `--api-key-file FNAME` | path to file containing API keys (default: none)<br/>(env: LLAMA_ARG_API_KEY_FILE) |
 | `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key<br/>(env: LLAMA_ARG_SSL_KEY_FILE) |
 | `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate<br/>(env: LLAMA_ARG_SSL_CERT_FILE) |
 | `--chat-template-kwargs STRING` | sets additional params for the json template parser, must be a valid json object string, e.g. '{"key1":"value1","key2":"value2"}'<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_KWARGS) |
-| `-to, --timeout N` | server read/write timeout in seconds (default: 600)<br/>(env: LLAMA_ARG_TIMEOUT) |
+| `-to, --timeout N` | server read/write timeout in seconds (default: 3600)<br/>(env: LLAMA_ARG_TIMEOUT) |
 | `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
 | `--cache-prompt, --no-cache-prompt` | whether to enable prompt caching (default: enabled)<br/>(env: LLAMA_ARG_CACHE_PROMPT) |
 | `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting, requires prompt caching to be enabled (default: 0)<br/>[(card)](https://ggml.ai/f0.png)<br/>(env: LLAMA_ARG_CACHE_REUSE) |
@@ -223,8 +224,8 @@ For the full list of features, please refer to [server's changelog](https://gith
 | `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))<br/>(env: LLAMA_ARG_REASONING) |
 | `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
 | `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)<br/>(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) |
-| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-vl, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
-| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-vl, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
+| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, granite-4.1, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-vl, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
+| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, granite-4.1, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-vl, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
 | `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)<br/>(env: LLAMA_ARG_SKIP_CHAT_PARSING) |
 | `--prefill-assistant, --no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)<br/>when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled<br/><br/>(env: LLAMA_ARG_PREFILL_ASSISTANT) |
 | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.10, 0.0 = disabled) |
@@ -1662,23 +1663,30 @@ Listing all models in cache. The model metadata will also include a field to ind
 {
   "data": [{
     "id": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M",
-    "in_cache": true,
     "path": "/Users/REDACTED/Library/Caches/llama.cpp/ggml-org_gemma-3-4b-it-GGUF_gemma-3-4b-it-Q4_K_M.gguf",
     "status": {
       "value": "loaded",
       "args": ["llama-server", "-ctx", "4096"]
     },
+    "architecture": {
+      "input_modalities": [
+        "text",
+        "image"
+      ],
+      "output_modalities": [
+        "text"
+      ]
+    },
     ...
   }]
 }
 ```
 
 Note:
-1. For a local GGUF (stored offline in a custom directory), the model object will have `"in_cache": false`.
-2. Adding `?reload=1` to the query params will refresh the list of models. The behavior is as follow:
+1. Adding `?reload=1` to the query params will refresh the list of models. The behavior is as follows:
     - If a model is running but updated or removed from the source, it will be unloaded
     - If a model is not running, it will be added or updated according to the source
-3. When the model is loaded, the info from `/v1/models` is forwarded to router's `/v1/models`. This includes metadata about the model and the runtime instance.
+2. When the model is loaded, the info from `/v1/models` is forwarded to router's `/v1/models`. This includes metadata about the model and the runtime instance.
 
 The `status` object can be:
 
diff --git a/tools/server/bench/speed-bench/README.md b/tools/server/bench/speed-bench/README.md
new file mode 100644
index 00000000000..8d3fcd804c4
--- /dev/null
+++ b/tools/server/bench/speed-bench/README.md
@@ -0,0 +1,117 @@
+# SPEED-Bench server benchmark
+
+A lightweight [SPEED-Bench](https://huggingface.co/datasets/nvidia/SPEED-Bench) client for benchmarking an already-running `llama-server` through its OpenAI-compatible API. It is primarily meant to evaluate speculative decoding (draft model, n-gram, MTP, EAGLE3, ...) by reporting per-category throughput, latency, and draft acceptance.
+
+The dataset handling follows the [aiperf SPEED-Bench tutorial](https://github.com/ai-dynamo/aiperf/blob/main/docs/tutorials/speed-bench.md), which also documents the dataset layout in more detail.
+
+## Install
+
+```bash
+pip install -r tools/server/bench/speed-bench/requirements.txt
+```
+
+## Start a server
+
+The client does not launch the server, so start `llama-server` yourself first. If you care about throughput numbers, set the client `--concurrency` to the server's slot count (`-np` / `--parallel`):
+
+```bash
+llama-server \
+  -m target.gguf \
+  -c 8192 \
+  --port 8080 \
+  -ngl 99 -fa on \
+  -np 1 \
+  --jinja
+```
+
+For speculative decoding, start the server with the appropriate flags for your setup (e.g. a draft model with `-md`, or `--spec-type ngram-mod`). See the [speculative decoding doc](../../../../docs/speculative.md) for details.
+
+## Run
+
+```bash
+python tools/server/bench/speed-bench/speed_bench.py \
+  --url localhost:8080 \
+  --bench qualitative \
+  --category coding \
+  --osl 1024 \
+  --concurrency 1
+```
+
+## Options
+
+| Option | Default | Description |
+| --- | --- | --- |
+| `--url` | `localhost:8080` | Server URL. The scheme and `/v1` are optional and a trailing slash is fine, so `localhost:8080` and `http://localhost:8080/v1/` both work. |
+| `--model` | none | Optional `model` field sent in each request. |
+| `--bench` | `qualitative` | SPEED-Bench config, e.g. `qualitative`, `throughput_1k`. See [available dataset variants](https://github.com/ai-dynamo/aiperf/blob/main/docs/tutorials/speed-bench.md#available-dataset-variants). |
+| `--category` | `all` | Category filter within the bench; comma-separated list or `all`. For `qualitative` the categories are `coding`, `humanities`, `math`, `multilingual`, `qa`, `rag`, `reasoning`, `roleplay`, `stem`, `summarization`, `writing`. For the `throughput_{ISL}` splits they are `high_entropy`, `low_entropy`, `mixed`. |
+| `--osl` | `4096` | Output sequence length, mapped to `max_tokens`. |
+| `--extra-inputs` | `{"temperature":0}` | Extra request fields as a JSON object. |
+| `--concurrency` | `1` | Concurrent client requests; usually match `-np`. |
+| `--limit` | none | Max samples per category (handy for smoke tests). |
+| `--timeout` | `600` | Per-request timeout in seconds. |
+| `--output` | none | Save raw per-request results and the summary to JSON. |
+
+A few common ones:
+
+- `--category all` runs every category in the bench.
+- `--category coding,math` runs just those two.
+- `--bench throughput_8k` runs a fixed-input-length throughput split.
+- `--limit 8` keeps at most 8 samples per category, which is enough for a quick check.
+
+The `throughput_{ISL}` splits use fixed input lengths (1k - 32k), so they are handy for long-context testing and for comparing different `llama-server` batching settings (e.g. sweeping `-ub` / `--ubatch-size`) on prompts of a known size. Make sure the server `-c` is large enough for the chosen split. When raising `-ub`, also raise `-b` to at least the same value, since the physical ubatch cannot exceed the logical batch.
+
+When `--output` is given, the JSON file holds the run `config`, the `selected_samples` / `completed_samples` / `failed_samples` counts, the per-category `summary` rows, and the per-sample `results`.
+
+## Metrics
+
+The summary prints one row per category plus an `overall` row:
+
+- `samples` - how many samples finished successfully.
+- `avg_prompt_t/s` - prefill throughput from llama.cpp (`timings.prompt_per_second`), averaged over the category's samples.
+- `avg_pred_t/s` - decode throughput from llama.cpp (`timings.predicted_per_second`), averaged over the category's samples.
+- `avg_latency` - average end-to-end request latency seen by the client.
+- `accept_rate` - `accepted / draft_n` over the category, or `n/a` if nothing was drafted (`draft_n == 0`).
+
+## Baseline vs speculative decoding
+
+Save a run from each server with `--output`, then diff the two JSON files with `speed_bench_compare.py`.
+
+First, start a plain `llama-server` (no speculative decoding) and save a baseline:
+
+```bash
+python tools/server/bench/speed-bench/speed_bench.py \
+  --url localhost:8080 \
+  --bench qualitative \
+  --category all \
+  --osl 1024 \
+  --concurrency 1 \
+  --output baseline.json
+```
+
+Then restart `llama-server` with speculative decoding enabled and save another run:
+
+```bash
+python tools/server/bench/speed-bench/speed_bench.py \
+  --url localhost:8080 \
+  --bench qualitative \
+  --category all \
+  --osl 1024 \
+  --concurrency 1 \
+  --output spec.json
+```
+
+Finally compare the two:
+
+```bash
+python tools/server/bench/speed-bench/speed_bench_compare.py \
+  --baseline baseline.json \
+  --speculative spec.json
+```
+
+The comparison table adds:
+
+- `decode_speedup = spec_avg_pred_t/s / base_avg_pred_t/s`
+- `latency_speedup = base_avg_latency / spec_avg_latency`
+
+Keep `--bench`, `--category`, `--osl`, and `--limit` the same across both runs, otherwise they won't be using the same prompts.
diff --git a/tools/server/bench/speed-bench/requirements.txt b/tools/server/bench/speed-bench/requirements.txt
new file mode 100644
index 00000000000..a524c2f5193
--- /dev/null
+++ b/tools/server/bench/speed-bench/requirements.txt
@@ -0,0 +1,3 @@
+datasets
+requests
+tqdm
diff --git a/tools/server/bench/speed-bench/speed_bench.py b/tools/server/bench/speed-bench/speed_bench.py
new file mode 100644
index 00000000000..adb378a6bf0
--- /dev/null
+++ b/tools/server/bench/speed-bench/speed_bench.py
@@ -0,0 +1,432 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import concurrent.futures
+import json
+import statistics
+import sys
+import time
+from dataclasses import asdict, dataclass
+from typing import Any
+from urllib.parse import urlparse
+
+import requests
+from datasets import get_dataset_config_names, load_dataset
+from tqdm import tqdm
+
+
+DATASET_REPO = "nvidia/SPEED-Bench"
+
+@dataclass
+class Sample:
+    id: str
+    category: str
+    turns: list[str]
+
+
+@dataclass
+class RequestResult:
+    id: str
+    category: str
+    ok: bool
+    turns: int
+    latency_s: float
+    prompt_tokens: int
+    completion_tokens: int
+    total_tokens: int
+    finish_reason: str | None
+    draft_n: int
+    draft_n_accepted: int
+    prompt_ms: float | None
+    predicted_ms: float | None
+    prompt_per_second: float | None
+    predicted_per_second: float | None
+    error: str | None
+
+
+def normalize_base_url(url: str) -> str:
+    url = url.strip().rstrip("/")
+    if not url:
+        raise ValueError("--url cannot be empty")
+    if "://" not in url:
+        url = "http://" + url
+    parsed = urlparse(url)
+    if not parsed.scheme or not parsed.netloc:
+        raise ValueError(f"invalid --url: {url}")
+    if not parsed.path.rstrip("/").endswith("/v1"):
+        url = url + "/v1"
+    return url.rstrip("/")
+
+
+def parse_extra_inputs(value: str) -> dict[str, Any]:
+    extra = json.loads(value)
+    if not isinstance(extra, dict):
+        raise ValueError("--extra-inputs must be a JSON object")
+    return extra
+
+
+def extract_turns(row: dict[str, Any]) -> list[str]:
+    turns = row.get("turns")
+    if isinstance(turns, list) and turns:
+        clean_turns = [str(turn).strip() for turn in turns if turn and str(turn).strip()]
+        if clean_turns:
+            return clean_turns
+    raise ValueError("missing or empty turns")
+
+
+def load_samples(args: argparse.Namespace) -> list[Sample]:
+    """Load the args.bench split and return the usable samples selected by --category and --limit."""
+    bench_names = get_dataset_config_names(DATASET_REPO)
+    if args.bench not in bench_names:
+        raise ValueError(
+            f"unknown --bench {args.bench!r}; available benches: {', '.join(bench_names)}"
+        )
+
+    dataset = load_dataset(DATASET_REPO, name=args.bench, split="test")
+    categories = list(dict.fromkeys(str(category) for category in dataset["category"]))  # unique, first-seen order
+    requested_categories = None  # None means "all": no category filter
+    if args.category != "all":
+        requested_list = [category.strip() for category in args.category.split(",") if category.strip()]
+        if not requested_list:
+            raise ValueError(
+                f"--category must be 'all' or a comma-separated list; available categories: {', '.join(categories)}"
+            )
+        requested_categories = set(requested_list)
+        unknown_categories = [category for category in requested_list if category not in categories]
+        if unknown_categories:
+            unknown = ", ".join(unknown_categories)
+            raise ValueError(
+                f"unknown --category {unknown!r} for bench {args.bench!r}; "
+                f"available categories: all, {', '.join(categories)}"
+            )
+
+    samples: list[Sample] = []
+    samples_per_category: dict[str, int] = {}
+    skipped = 0  # rows dropped because category, turns, or question_id were missing/blank
+    for row_raw in dataset:
+        row = dict(row_raw)
+        category_raw = row.get("category")
+        if not isinstance(category_raw, str) or not category_raw.strip():
+            skipped += 1
+            continue
+        category = category_raw.strip()
+        if requested_categories is not None and category not in requested_categories:
+            continue
+        if args.limit is not None and samples_per_category.get(category, 0) >= args.limit:  # per-category cap
+            continue
+
+        try:
+            turns = extract_turns(row)
+        except ValueError:
+            skipped += 1
+            continue
+        question_id = row.get("question_id")
+        if not isinstance(question_id, str) or not question_id.strip():
+            skipped += 1
+            continue
+        samples.append(Sample(id=question_id.strip(), category=category, turns=turns))
+        samples_per_category[category] = samples_per_category.get(category, 0) + 1
+
+    if not samples:
+        raise RuntimeError(f"no samples selected from bench={args.bench} category={args.category}")
+
+    if skipped:
+        print(f"speed_bench: skipped {skipped} rows with missing category, turns, or question_id")
+    return samples
+
+
+def parse_completion_response(data: dict[str, Any]) -> tuple[dict[str, Any], dict[str, Any], str | None, str]:
+    usage = data.get("usage") or {}
+    timings = data.get("timings") or {}
+    finish_reason = None
+    content = ""
+    choices = data.get("choices")
+    if isinstance(choices, list) and choices and isinstance(choices[0], dict):
+        choice = choices[0]
+        finish_reason = choice.get("finish_reason")
+        message = choice.get("message")
+        if isinstance(message, dict) and isinstance(message.get("content"), str):
+            content = message["content"]
+        elif isinstance(choice.get("text"), str):
+            content = choice["text"]
+    return usage, timings, finish_reason, content
+
+
+def run_request(
+    endpoint: str,
+    model: str | None,
+    messages: list[dict[str, str]],
+    osl: int,
+    extra_inputs: dict[str, Any],
+    timeout: float,
+) -> tuple[dict[str, Any], float]:
+    payload: dict[str, Any] = {
+        "messages": messages,
+        "max_tokens": osl,
+        "stream": False,
+    }
+    if model:
+        payload["model"] = model
+    payload.update(extra_inputs)
+    payload["max_tokens"] = osl
+
+    start = time.perf_counter()
+    response = requests.post(endpoint, json=payload, timeout=timeout)
+    latency_s = time.perf_counter() - start
+    if response.status_code != 200:
+        body = response.text[:500].replace("\n", "\\n")
+        raise RuntimeError(f"HTTP {response.status_code}: {body}")
+    return response.json(), latency_s
+
+
+def run_one(
+    sample: Sample,
+    endpoint: str,
+    model: str | None,
+    osl: int,
+    extra_inputs: dict[str, Any],
+    timeout: float,
+) -> RequestResult:
+    selected_turns = sample.turns
+    messages: list[dict[str, str]] = []
+    total_latency_s = 0.0
+    prompt_tokens = 0
+    completion_tokens = 0
+    total_tokens = 0
+    draft_n = 0
+    draft_n_accepted = 0
+    prompt_ms = 0.0
+    predicted_ms = 0.0
+    prompt_per_second = None
+    predicted_per_second = None
+    finish_reason: str | None = None
+    try:
+        for turn in selected_turns:
+            messages.append({"role": "user", "content": turn})
+            data, latency_s = run_request(endpoint, model, messages, osl, extra_inputs, timeout)
+            total_latency_s += latency_s
+            usage, timings, finish_reason, assistant_text = parse_completion_response(data)
+
+            turn_prompt_tokens = int(usage.get("prompt_tokens") or timings.get("prompt_n") or 0)
+            turn_completion_tokens_count = int(usage.get("completion_tokens") or timings.get("predicted_n") or 0)
+            turn_total_tokens_count = int(usage.get("total_tokens") or (turn_prompt_tokens + turn_completion_tokens_count))
+            prompt_tokens += turn_prompt_tokens
+            completion_tokens += turn_completion_tokens_count
+            total_tokens += turn_total_tokens_count
+            draft_n += int(timings.get("draft_n") or 0)
+            draft_n_accepted += int(timings.get("draft_n_accepted") or 0)
+            prompt_ms += float(timings.get("prompt_ms") or 0)
+            predicted_ms += float(timings.get("predicted_ms") or 0)
+            if len(selected_turns) == 1 and isinstance(timings.get("prompt_per_second"), (int, float)):
+                prompt_per_second = float(timings["prompt_per_second"])
+            if len(selected_turns) == 1 and isinstance(timings.get("predicted_per_second"), (int, float)):
+                predicted_per_second = float(timings["predicted_per_second"])
+
+            messages.append({"role": "assistant", "content": assistant_text})
+
+        if total_tokens == 0:
+            total_tokens = prompt_tokens + completion_tokens
+        if len(selected_turns) > 1:
+            prompt_per_second = (prompt_tokens / (prompt_ms / 1000)) if prompt_ms > 0 else None
+            predicted_per_second = (completion_tokens / (predicted_ms / 1000)) if predicted_ms > 0 else None
+
+        return RequestResult(
+            id=sample.id,
+            category=sample.category,
+            ok=True,
+            turns=len(selected_turns),
+            latency_s=total_latency_s,
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+            total_tokens=total_tokens,
+            finish_reason=finish_reason,
+            draft_n=draft_n,
+            draft_n_accepted=draft_n_accepted,
+            prompt_ms=prompt_ms if prompt_ms > 0 else None,
+            predicted_ms=predicted_ms if predicted_ms > 0 else None,
+            prompt_per_second=prompt_per_second,
+            predicted_per_second=predicted_per_second,
+            error=None,
+        )
+    except Exception as exc:
+        return RequestResult(
+            id=sample.id,
+            category=sample.category,
+            ok=False,
+            turns=len(selected_turns),
+            latency_s=total_latency_s,
+            prompt_tokens=0,
+            completion_tokens=0,
+            total_tokens=0,
+            finish_reason=None,
+            draft_n=0,
+            draft_n_accepted=0,
+            prompt_ms=None,
+            predicted_ms=None,
+            prompt_per_second=None,
+            predicted_per_second=None,
+            error=str(exc),
+        )
+
+
+def summarize_group(category: str, results: list[RequestResult]) -> dict[str, Any]:
+    ok_results = [result for result in results if result.ok]
+    latencies = [result.latency_s for result in ok_results]
+    server_prompt_speeds = [
+        result.prompt_per_second
+        for result in ok_results
+        if result.prompt_per_second is not None
+    ]
+    server_completion_speeds = [
+        result.predicted_per_second
+        for result in ok_results
+        if result.predicted_per_second is not None
+    ]
+    turns = sum(result.turns for result in ok_results)
+    draft_n = sum(result.draft_n for result in ok_results)
+    accepted = sum(result.draft_n_accepted for result in ok_results)
+
+    return {
+        "category": category,
+        "requests": len(ok_results),
+        "turns": turns,
+        "failed": len(results) - len(ok_results),
+        "avg_prompt_t_s": statistics.mean(server_prompt_speeds) if server_prompt_speeds else None,
+        "avg_pred_t_s": statistics.mean(server_completion_speeds) if server_completion_speeds else None,
+        "avg_latency": statistics.mean(latencies) if latencies else None,
+        "draft_n": draft_n,
+        "accepted": accepted,
+        "accept_rate": (accepted / draft_n) if draft_n > 0 else None,
+    }
+
+
+def fmt_value(value: Any, kind: str = "") -> str:
+    if value is None:
+        return "n/a"
+    if kind == "int":
+        return str(int(value))
+    if kind == "rate":
+        return f"{float(value):.4f}"
+    if kind == "seconds":
+        return f"{float(value):.3f}s"
+    if kind == "speed":
+        return f"{float(value):.2f}"
+    if kind == "speedup":
+        return f"{float(value):.2f}x"
+    return str(value)
+
+
+def print_table(rows: list[dict[str, Any]]) -> None:
+    columns = [
+        ("category", "category", ""),
+        ("samples", "requests", "int"),
+        ("avg_prompt_t/s", "avg_prompt_t_s", "speed"),
+        ("avg_pred_t/s", "avg_pred_t_s", "speed"),
+        ("avg_latency", "avg_latency", "seconds"),
+        ("accept_rate", "accept_rate", "rate"),
+    ]
+    print_rows(rows, columns)
+
+
+def print_rows(rows: list[dict[str, Any]], columns: list[tuple[str, str, str]]) -> None:
+    rendered_rows = []
+    for row in rows:
+        rendered_rows.append([fmt_value(row.get(key), kind) for _, key, kind in columns])
+
+    widths = [len(header) for header, _, _ in columns]
+    for rendered in rendered_rows:
+        for i, cell in enumerate(rendered):
+            widths[i] = max(widths[i], len(cell))
+
+    header = "  ".join(header.ljust(widths[i]) for i, (header, _, _) in enumerate(columns))
+    print(header)
+    print("  ".join("-" * width for width in widths))
+    for rendered in rendered_rows:
+        print("  ".join(cell.ljust(widths[i]) for i, cell in enumerate(rendered)))
+
+
+def save_output(path: str, args: argparse.Namespace, samples: list[Sample], results: list[RequestResult], summary: list[dict[str, Any]]) -> None:
+    payload = {
+        "config": {
+            "url": args.url,
+            "model": args.model,
+            "bench": args.bench,
+            "category": args.category,
+            "osl": args.osl,
+            "concurrency": args.concurrency,
+            "extra_inputs": args.extra_inputs,
+        },
+        "selected_samples": len(samples),
+        "completed_samples": sum(1 for result in results if result.ok),
+        "failed_samples": sum(1 for result in results if not result.ok),
+        "summary": summary,
+        "results": [asdict(result) for result in results],
+    }
+    with open(path, "w", encoding="utf-8") as f:
+        json.dump(payload, f, indent=2, sort_keys=True)
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Run the SPEED-Bench CLI; returns 0 on success, 1 if any sample failed, 2 if setup failed."""
+    parser = argparse.ArgumentParser(description="Run SPEED-Bench against an OpenAI-compatible llama-server.")
+    parser.add_argument("--url", default="localhost:8080", help="Server URL, for example localhost:8080 or http://localhost:8080/v1")
+    parser.add_argument("--model", default=None, help="Optional model name to send in OpenAI requests")
+    parser.add_argument("--bench", default="qualitative", help="SPEED-Bench config to run, for example qualitative or throughput_1k")
+    parser.add_argument("--category", default="all", help="Category filter within the selected bench; comma-separated list, or all for no category filter")
+    parser.add_argument("--osl", type=int, default=4096, help="Output sequence length, mapped to max_tokens")
+    parser.add_argument("--extra-inputs", default='{"temperature":0}', help="Extra request fields as a JSON object")
+    parser.add_argument("--concurrency", type=int, default=1, help="Concurrent client requests; usually match llama-server -np")
+    parser.add_argument("--limit", type=int, default=None, help="Optional sample limit per category for smoke tests")
+    parser.add_argument("--timeout", type=float, default=600, help="Per-request timeout in seconds")
+    parser.add_argument("--output", default=None, help="Optional path to save raw results JSON")
+    args = parser.parse_args(argv)
+    try:
+        if args.concurrency < 1:
+            raise ValueError("--concurrency must be at least 1")
+        endpoint = normalize_base_url(args.url) + "/chat/completions"
+        # Keep the parsed dict on args so save_output() writes it as a JSON object, not a string.
+        extra_inputs = args.extra_inputs = parse_extra_inputs(args.extra_inputs)
+        samples = load_samples(args)
+    except Exception as exc:
+        print(f"speed_bench: setup failed: {exc}", file=sys.stderr)
+        return 2
+
+    print(f"speed_bench: loaded {len(samples)} samples from bench={args.bench} category={args.category}")
+    results: list[RequestResult] = []
+    started = time.perf_counter()
+    with concurrent.futures.ThreadPoolExecutor(max_workers=args.concurrency) as executor:
+        futures = [
+            executor.submit(run_one, sample, endpoint, args.model, args.osl, extra_inputs, args.timeout)
+            for sample in samples
+        ]
+        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="speed_bench", unit="sample"):
+            results.append(future.result())  # collected in completion order
+
+    elapsed = time.perf_counter() - started
+    categories = list(dict.fromkeys(sample.category for sample in samples))
+    summary = [
+        summarize_group(category, [result for result in results if result.category == category])
+        for category in categories
+    ]
+    summary.append(summarize_group("overall", results))
+    print()
+    print(f"Summary (elapsed={elapsed:.2f}s)")
+    print_table(summary)
+
+    if args.output:
+        save_output(args.output, args, samples, results, summary)
+        print(f"\nspeed_bench: wrote {args.output}")
+
+    failed = sum(1 for result in results if not result.ok)
+    if failed:
+        print(f"\nspeed_bench: {failed} samples failed", file=sys.stderr)
+        first_error = next((result.error for result in results if result.error), None)
+        if first_error:
+            print(f"first error: {first_error}", file=sys.stderr)
+        return 1
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/tools/server/bench/speed-bench/speed_bench_compare.py b/tools/server/bench/speed-bench/speed_bench_compare.py
new file mode 100644
index 00000000000..070ab57db5d
--- /dev/null
+++ b/tools/server/bench/speed-bench/speed_bench_compare.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from typing import Any
+
+from speed_bench import fmt_value, print_rows
+
+
+def load_summary(path: str) -> list[dict[str, Any]]:
+    with open(path, "r", encoding="utf-8") as f:
+        data = json.load(f)
+    summary = data.get("summary")
+    if not isinstance(summary, list):
+        raise ValueError(f"{path} does not contain a summary list")
+    return summary
+
+
+def compare_rows(baseline: list[dict[str, Any]], speculative: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    baseline_by_category = {row["category"]: row for row in baseline}
+    comparisons = []
+    for row in speculative:
+        base = baseline_by_category.get(row["category"])
+        if not base:
+            continue
+        base_speed = base.get("avg_pred_t_s")
+        spec_speed = row.get("avg_pred_t_s")
+        base_latency = base.get("avg_latency")
+        spec_latency = row.get("avg_latency")
+        comparisons.append(
+            {
+                "category": row["category"],
+                "base_avg_pred_t_s": base_speed,
+                "spec_avg_pred_t_s": spec_speed,
+                "decode_speedup": (spec_speed / base_speed) if base_speed and spec_speed else None,
+                "base_avg_latency": base_latency,
+                "spec_avg_latency": spec_latency,
+                "latency_speedup": (base_latency / spec_latency) if base_latency and spec_latency else None,
+                "accept_rate": row.get("accept_rate"),
+            }
+        )
+    return comparisons
+
+
+def print_comparison(rows: list[dict[str, Any]]) -> None:
+    if not rows:
+        print("No overlapping categories found for comparison.")
+        return
+    columns = [
+        ("category", "category", ""),
+        ("base_avg_pred_t/s", "base_avg_pred_t_s", "speed"),
+        ("spec_avg_pred_t/s", "spec_avg_pred_t_s", "speed"),
+        ("decode_speedup", "decode_speedup", "speedup"),
+        ("base_avg_latency", "base_avg_latency", "seconds"),
+        ("spec_avg_latency", "spec_avg_latency", "seconds"),
+        ("latency_speedup", "latency_speedup", "speedup"),
+        ("accept_rate", "accept_rate", "rate"),
+    ]
+    print_rows(rows, columns)
+
+
+def main(argv: list[str] | None = None) -> int:
+    parser = argparse.ArgumentParser(description="Compare two SPEED-Bench runs (baseline vs speculative).")
+    parser.add_argument("--baseline", required=True, help="Baseline results JSON produced by speed_bench.py --output")
+    parser.add_argument("--speculative", required=True, help="Speculative decoding results JSON produced by speed_bench.py --output")
+    args = parser.parse_args(argv)
+
+    try:
+        baseline = load_summary(args.baseline)
+        speculative = load_summary(args.speculative)
+    except Exception as exc:
+        print(f"speed_bench_compare: failed to load inputs: {exc}", file=sys.stderr)
+        return 2
+
+    comparisons = compare_rows(baseline, speculative)
+    print(f"Comparison: baseline={args.baseline} speculative={args.speculative}")
+    print_comparison(comparisons)
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/tools/server/chat-llama2.sh b/tools/server/chat-llama2.sh
deleted file mode 100755
index 450445f17e3..00000000000
--- a/tools/server/chat-llama2.sh
+++ /dev/null
@@ -1,109 +0,0 @@
-#!/usr/bin/env bash
-
-API_URL="${API_URL:-http://127.0.0.1:8080}"
-
-CHAT=(
-    "Hello, Assistant."
-    "Hello. How may I help you today?"
-)
-
-INSTRUCTION="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
-
-trim() {
-    shopt -s extglob
-    set -- "${1##+([[:space:]])}"
-    printf "%s" "${1%%+([[:space:]])}"
-}
-
-trim_trailing() {
-    shopt -s extglob
-    printf "%s" "${1%%+([[:space:]])}"
-}
-
-format_prompt() {
-    if [[ "${#CHAT[@]}" -eq 0 ]]; then
-        echo -n "[INST] <<SYS>>\n${INSTRUCTION}\n<</SYS>>"
-    else
-        LAST_INDEX=$(( ${#CHAT[@]} - 1 ))
-        echo -n "${CHAT[$LAST_INDEX]}\n[INST] $1 [/INST]"
-    fi
-}
-
-tokenize() {
-    curl \
-        --silent \
-        --request POST \
-        --url "${API_URL}/tokenize" \
-        --header "Content-Type: application/json" \
-        --data-raw "$(jq -ns --arg content "$1" '{content:$content}')" \
-    | jq '.tokens[]'
-}
-
-N_KEEP=$(tokenize "[INST] <<SYS>>\n${INSTRUCTION}\n<</SYS>>" | wc -l)
-
-chat_completion() {
-    PROMPT="$(trim_trailing "$(format_prompt "$1")")"
-    DATA="$(echo -n "$PROMPT" | jq -Rs --argjson n_keep $N_KEEP '{
-        prompt: .,
-        temperature: 0.2,
-        top_k: 40,
-        top_p: 0.9,
-        n_keep: $n_keep,
-        n_predict: 1024,
-        stop: ["[INST]"],
-        stream: true
-    }')"
-
-    # Create a temporary file to hold the Python output
-    TEMPFILE=$(mktemp)
-
-    exec 3< <(curl \
-        --silent \
-        --no-buffer \
-        --request POST \
-        --url "${API_URL}/completion" \
-        --header "Content-Type: application/json" \
-        --data-raw "${DATA}")
-
-    python -c "
-import json
-import sys
-
-answer = ''
-while True:
-    line = sys.stdin.readline()
-    if not line:
-        break
-    if line.startswith('data: '):
-        json_content = line[6:].strip()
-        content = json.loads(json_content)['content']
-        sys.stdout.write(content)
-        sys.stdout.flush()
-        answer += content
-
-answer = answer.rstrip('\n')
-
-# Write the answer to the temporary file
-with open('$TEMPFILE', 'w') as f:
-    f.write(answer)
-    " <&3
-
-    exec 3<&-
-
-    # Read the answer from the temporary file
-    ANSWER=$(cat $TEMPFILE)
-
-    # Clean up the temporary file
-    rm $TEMPFILE
-
-    printf "\n"
-
-    CHAT+=("$1" "$(trim "$ANSWER")")
-}
-
-while true; do
-    echo -en "\033[0;32m"  # Green color
-    read -r -e -p "> " QUESTION
-    echo -en "\033[0m"  # Reset color
-    chat_completion "${QUESTION}"
-done
diff --git a/tools/server/chat.mjs b/tools/server/chat.mjs
deleted file mode 100644
index 4fef5655a89..00000000000
--- a/tools/server/chat.mjs
+++ /dev/null
@@ -1,131 +0,0 @@
-import * as readline from 'node:readline'
-import { stdin, stdout } from 'node:process'
-import { readFileSync } from 'node:fs'
-import { SchemaConverter }  from './public_legacy/json-schema-to-grammar.mjs'
-
-const args = process.argv.slice(2);
-const grammarJsonSchemaFile = args.find(
-    (_, index) => args[index - 1] === "--grammar-json-schema"
-);
-
-const no_cached_prompt = args.find(
-    (_, index) => args[index - 1] === "--no-cache-prompt"
-) ?? "false";
-
-const grammarFile = args.find((_, index) => args[index - 1] === "--grammar");
-
-// Example usage: function,arguments
-const grammarJsonSchemaPropOrder = args.find(
-    (_, index) => args[index - 1] === "--grammar-json-schema-prop-order"
-);
-const propOrder = grammarJsonSchemaPropOrder
-    ? grammarJsonSchemaPropOrder
-          .split(",")
-          .reduce((acc, cur, index) => ({ ...acc, [cur]: index }), {})
-    : {};
-
-let grammar = null
-if (grammarJsonSchemaFile) {
-    let schema = JSON.parse(readFileSync(grammarJsonSchemaFile, 'utf-8'))
-    const converter = new SchemaConverter({prop_order: propOrder, allow_fetch: true})
-    schema = await converter.resolveRefs(schema, grammarJsonSchemaFile)
-    converter.visit(schema, '')
-    grammar = converter.formatGrammar()
-}
-if (grammarFile) {
-    grammar = readFileSync(grammarFile, 'utf-8')
-}
-
-// for cached prompt
-let slot_id = -1;
-
-const API_URL = 'http://127.0.0.1:8080'
-
-const chat = [
-    {
-        human: "Hello, Assistant.",
-        assistant: "Hello. How may I help you today?"
-    },
-    {
-        human: "Please tell me the largest city in Europe.",
-        assistant: "Sure. The largest city in Europe is Moscow, the capital of Russia."
-    },
-]
-
-const instruction = `A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.`
-
-function format_prompt(question) {
-    return `${instruction}\n${
-        chat.map(m =>`### Human: ${m.human}\n### Assistant: ${m.assistant}`).join("\n")
-    }\n### Human: ${question}\n### Assistant:`
-}
-
-async function tokenize(content) {
-    const result = await fetch(`${API_URL}/tokenize`, {
-        method: 'POST',
-        body: JSON.stringify({ content })
-    })
-
-    if (!result.ok) {
-        return []
-    }
-
-    return await result.json().tokens
-}
-
-const n_keep = await tokenize(instruction).length
-
-async function chat_completion(question) {
-    const result = await fetch(`${API_URL}/completion`, {
-        method: 'POST',
-        body: JSON.stringify({
-            prompt: format_prompt(question),
-            temperature: 0.2,
-            top_k: 40,
-            top_p: 0.9,
-            n_keep: n_keep,
-            n_predict: 256,
-            cache_prompt: no_cached_prompt === "false",
-            slot_id: slot_id,
-            stop: ["\n### Human:"], // stop completion after generating this
-            grammar,
-            stream: true,
-        })
-    })
-
-    if (!result.ok) {
-        return
-    }
-
-    let answer = ''
-
-    for await (var chunk of result.body) {
-        const t = Buffer.from(chunk).toString('utf8')
-        if (t.startsWith('data: ')) {
-            const message = JSON.parse(t.substring(6))
-            slot_id = message.slot_id
-            answer += message.content
-            process.stdout.write(message.content)
-            if (message.stop) {
-                if (message.truncated) {
-                    chat.shift()
-                }
-                break
-            }
-        }
-    }
-
-    process.stdout.write('\n')
-    chat.push({ human: question, assistant: answer.trimStart() })
-}
-
-const rl = readline.createInterface({ input: stdin, output: stdout });
-
-const readlineQuestion = (rl, query, options) => new Promise((resolve, reject) => {
-    rl.question(query, options, resolve)
-});
-
-while(true) {
-    const question = await readlineQuestion(rl, '> ')
-    await chat_completion(question)
-}
diff --git a/tools/server/chat.sh b/tools/server/chat.sh
deleted file mode 100755
index 84cea2d56a0..00000000000
--- a/tools/server/chat.sh
+++ /dev/null
@@ -1,80 +0,0 @@
-#!/usr/bin/env bash
-
-API_URL="${API_URL:-http://127.0.0.1:8080}"
-
-CHAT=(
-    "Hello, Assistant."
-    "Hello. How may I help you today?"
-    "Please tell me the largest city in Europe."
-    "Sure. The largest city in Europe is Moscow, the capital of Russia."
-)
-
-INSTRUCTION="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
-
-trim() {
-    shopt -s extglob
-    set -- "${1##+([[:space:]])}"
-    printf "%s" "${1%%+([[:space:]])}"
-}
-
-trim_trailing() {
-    shopt -s extglob
-    printf "%s" "${1%%+([[:space:]])}"
-}
-
-format_prompt() {
-    echo -n "${INSTRUCTION}"
-    printf "\n### Human: %s\n### Assistant: %s" "${CHAT[@]}" "$1"
-}
-
-tokenize() {
-    curl \
-        --silent \
-        --request POST \
-        --url "${API_URL}/tokenize" \
-        --header "Content-Type: application/json" \
-        --data-raw "$(jq -ns --arg content "$1" '{content:$content}')" \
-    | jq '.tokens[]'
-}
-
-N_KEEP=$(tokenize "${INSTRUCTION}" | wc -l)
-
-chat_completion() {
-    PROMPT="$(trim_trailing "$(format_prompt "$1")")"
-    DATA="$(echo -n "$PROMPT" | jq -Rs --argjson n_keep $N_KEEP '{
-        prompt: .,
-        temperature: 0.2,
-        top_k: 40,
-        top_p: 0.9,
-        n_keep: $n_keep,
-        n_predict: 256,
-        cache_prompt: true,
-        stop: ["\n### Human:"],
-        stream: true
-    }')"
-
-    ANSWER=''
-
-    while IFS= read -r LINE; do
-        if [[ $LINE = data:* ]]; then
-            CONTENT="$(echo "${LINE:5}" | jq -r '.content')"
-            printf "%s" "${CONTENT}"
-            ANSWER+="${CONTENT}"
-        fi
-    done < <(curl \
-        --silent \
-        --no-buffer \
-        --request POST \
-        --url "${API_URL}/completion" \
-        --header "Content-Type: application/json" \
-        --data-raw "${DATA}")
-
-    printf "\n"
-
-    CHAT+=("$1" "$(trim "$ANSWER")")
-}
-
-while true; do
-    read -r -e -p "> " QUESTION
-    chat_completion "${QUESTION}"
-done
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index ae9e0bf60d8..bfe3443c1de 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -1734,7 +1734,7 @@ struct server_context_impl {
         return true;
     }
 
-    void send_partial_response(server_slot & slot, const completion_token_output & tkn, bool is_progress) {
+    void send_partial_response(server_slot & slot, const completion_token_output & tkn, bool is_progress, bool is_begin = false) {
         auto res = std::make_unique<server_task_result_cmpl_partial>();
 
         res->id    = slot.task->id;
@@ -1746,6 +1746,9 @@ struct server_context_impl {
             res->progress.cache     = slot.n_prompt_tokens_cache;
             res->progress.processed = slot.prompt.tokens.size();
             res->progress.time_ms   = (ggml_time_us() - slot.t_start_process_prompt) / 1000;
+        } else if (is_begin) {
+            // begin-of-stream marker: no token payload; chained so progress updates skip the token branch below
+            res->is_begin = true;
         } else {
             res->content = tkn.text_to_send;
             res->tokens  = { tkn.tok };
@@ -2828,10 +2831,15 @@ struct server_context_impl {
 
                         slot.prompt.tokens.keep_first(n_past);
 
-                        // send initial 0% progress update if needed
                         // this is to signal the client that the request has started processing
-                        if (slot.task->params.stream && slot.task->params.return_progress) {
-                            send_partial_response(slot, {}, true);
+                        if (slot.task->params.stream) {
+                            if (slot.task->params.return_progress) {
+                                // send initial 0% progress update if needed
+                                send_partial_response(slot, {}, true);
+                            } else {
+                                // otherwise, for streaming without progress, signal HTTP to send the headers (i.e. 200 status)
+                                send_partial_response(slot, {}, false, true);
+                            }
                         }
                     }
 
@@ -3745,7 +3753,9 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
         // next responses are streamed
         // to be sent immediately
         json first_result_json = first_result->to_json();
-        if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) {
+        if (first_result_json == nullptr) {
+            res->data = ""; // simply send HTTP headers and status code
+        } else if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) {
             res->data = format_anthropic_sse(first_result_json);
         } else if (res_type == TASK_RESPONSE_TYPE_OAI_RESP) {
             res->data = format_oai_resp_sse(first_result_json);
diff --git a/tools/server/server-http.cpp b/tools/server/server-http.cpp
index 00290b0782d..3616b3b4d01 100644
--- a/tools/server/server-http.cpp
+++ b/tools/server/server-http.cpp
@@ -5,9 +5,9 @@
 
 #include <cpp-httplib/httplib.h>
 
-#include <cstdlib>
 #include <functional>
 #include <future>
+#include <memory>
 #include <string>
 #include <thread>
 
@@ -21,7 +21,7 @@ class server_http_context::Impl {
 };
 
 server_http_context::server_http_context()
-    : pimpl(std::make_unique<server_http_context::Impl>())
+    : pimpl(std::make_unique<Impl>())
 {}
 
 server_http_context::~server_http_context() = default;
@@ -62,7 +62,7 @@ struct gcp_params {
     }
 
     static std::string getenv(const char * name, const std::string & default_value, bool ensure_leading_slash = false) {
-        const char * value = std::getenv(name);
+        const auto * value = std::getenv(name);
         if (value == nullptr || value[0] == '\0') {
             return default_value;
         }
@@ -94,15 +94,15 @@ bool server_http_context::init(const common_params & params) {
     auto & srv = pimpl->srv;
 
 #ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-    if (params.ssl_file_key != "" && params.ssl_file_cert != "") {
+    if (!params.ssl_file_key.empty() && !params.ssl_file_cert.empty()) {
         SRV_INF("running with SSL: key = %s, cert = %s\n", params.ssl_file_key.c_str(), params.ssl_file_cert.c_str());
-        srv.reset(
-            new httplib::SSLServer(params.ssl_file_cert.c_str(), params.ssl_file_key.c_str())
+        srv = std::make_unique<httplib::SSLServer>(
+            params.ssl_file_cert.c_str(), params.ssl_file_key.c_str()
         );
         is_ssl = true;
     } else {
         SRV_INF("%s", "running without SSL\n");
-        srv.reset(new httplib::Server());
+        srv = std::make_unique<httplib::Server>();
     }
 #else
     if (params.ssl_file_key != "" && params.ssl_file_cert != "") {
@@ -150,7 +150,7 @@ bool server_http_context::init(const common_params & params) {
     // set timeouts and change hostname and port
     srv->set_read_timeout (params.timeout_read);
     srv->set_write_timeout(params.timeout_write);
-    srv->set_socket_options([reuse_port = params.reuse_port](socket_t sock) {
+    srv->set_socket_options([reuse_port = params.reuse_port](const socket_t sock) {
         httplib::set_socket_opt(sock, SOL_SOCKET, SO_REUSEADDR, 1);
         if (reuse_port) {
 #ifdef SO_REUSEPORT
@@ -162,8 +162,8 @@ bool server_http_context::init(const common_params & params) {
     });
 
     if (params.api_keys.size() == 1) {
-        auto key = params.api_keys[0];
-        std::string substr = key.substr(std::max((int)(key.length() - 4), 0));
+        const auto key = params.api_keys[0];
+        const std::string substr = key.substr(std::max(static_cast<int>(key.length() - 4), 0));
         SRV_INF("api_keys: ****%s\n", substr.c_str());
     } else if (params.api_keys.size() > 1) {
         SRV_INF("api_keys: %zu keys loaded\n", params.api_keys.size());
@@ -203,7 +203,7 @@ bool server_http_context::init(const common_params & params) {
         }
 
         // remove the "Bearer " prefix if needed
-        std::string prefix = "Bearer ";
+        static const std::string prefix = "Bearer "; // immutable: shared by every invocation of this handler
         if (req_api_key.substr(0, prefix.size()) == prefix) {
             req_api_key = req_api_key.substr(prefix.size());
         }
@@ -232,11 +232,10 @@ bool server_http_context::init(const common_params & params) {
     };
 
     auto middleware_server_state = [this](const httplib::Request & req, httplib::Response & res) {
-        bool ready = is_ready.load();
-        if (!ready) {
+        if (!is_ready.load()) {
 #if defined(LLAMA_UI_HAS_ASSETS)
-            auto tmp = string_split<std::string>(req.path, '.');
-            if (req.path == "/" || (tmp.size() > 0 && tmp.back() == "html")) {
+            if (const auto tmp = string_split<std::string>(req.path, '.');
+                req.path == "/" || (!tmp.empty() && tmp.back() == "html")) {
                 if (const llama_ui_asset * a = llama_ui_find_asset("loading.html")) {
                     res.status = 503;
                     res.set_content(reinterpret_cast<const char*>(a->data), a->size, "text/html; charset=utf-8");
@@ -284,17 +283,17 @@ bool server_http_context::init(const common_params & params) {
         return httplib::Server::HandlerResponse::Unhandled;
     });
 
-    int n_threads_http = params.n_threads_http;
+    auto n_threads_http = params.n_threads_http;
     if (n_threads_http < 1) {
         // +4 threads for monitoring, health and some threads reserved for MCP and other tasks in the future
-        n_threads_http = std::max(params.n_parallel + 4, (int32_t) std::thread::hardware_concurrency() - 1);
+        n_threads_http = std::max(params.n_parallel + 4, static_cast<int32_t>(std::thread::hardware_concurrency()) - 1);
     }
     SRV_INF("using %d threads for HTTP server\n", n_threads_http);
     srv->new_task_queue = [n_threads_http] {
         // spawn n_threads_http fixed thread (always alive), while allow up to 1024 max possible additional threads
         // when n_threads_http is used, server will create new "dynamic" threads that will be destroyed after processing each request
         // ref: https://github.com/yhirose/cpp-httplib/pull/2368
-        size_t max_threads = (size_t)n_threads_http + 1024;
+        const auto max_threads = static_cast<size_t>(n_threads_http) + 1024;
         return new httplib::ThreadPool(n_threads_http, max_threads);
     };
 
@@ -310,10 +309,9 @@ bool server_http_context::init(const common_params & params) {
         // register static assets routes
         if (!params.public_path.empty()) {
             // Set the base directory for serving static files
-            bool is_found = srv->set_mount_point(params.api_prefix + "/", params.public_path);
-            if (!is_found) {
+            if (const auto is_found = srv->set_mount_point(params.api_prefix + "/", params.public_path); !is_found) {
                 SRV_ERR("static assets path not found: %s\n", params.public_path.c_str());
-                return 1;
+                return false;
             }
         } else {
 #if defined(LLAMA_UI_HAS_ASSETS)
@@ -353,9 +351,9 @@ bool server_http_context::init(const common_params & params) {
 bool server_http_context::start() {
     // Bind and listen
 
-    auto & srv = pimpl->srv;
-    bool was_bound = false;
-    bool is_sock = false;
+    const auto & srv = pimpl->srv;
+    auto was_bound = false;
+    auto is_sock = false;
     if (string_ends_with(std::string(hostname), ".sock")) {
         is_sock = true;
         SRV_INF("%s", "setting address family to AF_UNIX\n");
@@ -367,7 +365,7 @@ bool server_http_context::start() {
         SRV_INF("%s", "binding port with default address family\n");
         // bind HTTP listen port
         if (port == 0) {
-            int bound_port = srv->bind_to_any_port(hostname);
+            const auto bound_port = srv->bind_to_any_port(hostname);
             was_bound = (bound_port >= 0);
             if (was_bound) {
                 port = bound_port;
@@ -383,7 +381,7 @@ bool server_http_context::start() {
     }
 
     // run the HTTP server in a thread
-    thread = std::thread([this]() { pimpl->srv->listen_after_bind(); });
+    thread = std::thread([this] { pimpl->srv->listen_after_bind(); });
     srv->wait_until_ready();
 
     listening_address = is_sock ? string_format("unix://%s", hostname.c_str())
@@ -440,13 +438,13 @@ static void process_handler_response(server_http_req_ptr && request, server_http
     if (response->is_stream()) {
         res.status = response->status;
         set_headers(res, response->headers);
-        std::string content_type = response->content_type;
+        const std::string content_type = response->content_type;
         // convert to shared_ptr as both chunked_content_provider() and on_complete() need to use it
-        std::shared_ptr<server_http_req> q_ptr = std::move(request);
-        std::shared_ptr<server_http_res> r_ptr = std::move(response);
-        const auto chunked_content_provider = [response = r_ptr](size_t, httplib::DataSink & sink) -> bool {
+        std::shared_ptr q_ptr = std::move(request);
+        std::shared_ptr r_ptr = std::move(response);
+        const auto chunked_content_provider = [response = r_ptr](size_t, const httplib::DataSink & sink) -> bool {
             std::string chunk;
-            bool has_next = response->next(chunk);
+            const bool has_next = response->next(chunk);
             if (!chunk.empty()) {
                 if (!sink.write(chunk.data(), chunk.size())) {
                     return false;
@@ -557,7 +555,7 @@ static std::string path_to_gcp_format(const std::string & path) {
         if (c == '/' || c == '-' || c == '_') {
             cap = true;
         } else {
-            result += cap ? (char)std::toupper(c) : (char)c;
+            result += static_cast<char>(cap ? std::toupper(c) : c);
             cap = false;
         }
     }
@@ -581,7 +579,7 @@ static json parse_gcp_predict_response(const server_http_res_ptr & res) {
     }
 }
 
-void server_http_context::register_gcp_compat() {
+void server_http_context::register_gcp_compat() const {
     const gcp_params gcp;
 
     if (!gcp.enabled) {
@@ -602,7 +600,7 @@ void server_http_context::register_gcp_compat() {
     }
 
     if (!gcp.path_health.empty()) {
-        auto health_handler = handlers.find("/health");
+        const auto health_handler = handlers.find("/health");
         GGML_ASSERT(health_handler != handlers.end());
         get(gcp.path_health, health_handler->second);
     }
diff --git a/tools/server/server-http.h b/tools/server/server-http.h
index 099b5e1cc6f..fede8c8f30a 100644
--- a/tools/server/server-http.h
+++ b/tools/server/server-http.h
@@ -73,7 +73,7 @@ struct server_http_context {
 
     std::string path_prefix;
     std::string hostname;
-    int port;
+    int port    = 8080;
     bool is_ssl = false;
 
     server_http_context();
@@ -88,7 +88,7 @@ struct server_http_context {
 
     // Register the Google Cloud Platform (Vertex AI) compat (AIP_PREDICT_ROUTE env var, or /predict)
     // Must be called AFTER all other API routes are registered
-    void register_gcp_compat();
+    void register_gcp_compat() const;
 
     // for debugging
     std::string listening_address;
diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index 47b6c2a4ec0..49b0e423f46 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -180,7 +180,8 @@ void server_model_meta::update_caps() {
             "LLAMA_ARG_HF_REPO",
             "LLAMA_ARG_HF_REPO_FILE",
         });
-        params.offline = true; // avoid any unwanted network call during capability detection
+        params.offline = true;
+        // params.skip_download = true; // TODO: ideally, we should validate the model here, but it takes too much time
         common_params_handle_models(params, LLAMA_EXAMPLE_SERVER);
         if (params.mmproj.path.empty()) {
             multimodal = { false, false };
@@ -371,18 +372,19 @@ void server_models::load_models() {
         // FIRST LOAD: add all models, then unlock for autoloading
         for (const auto & [name, preset] : final_presets) {
             server_model_meta meta{
-                /* preset       */ preset,
-                /* name         */ name,
-                /* aliases      */ {},
-                /* tags         */ {},
-                /* port         */ 0,
-                /* status       */ SERVER_MODEL_STATUS_UNLOADED,
-                /* last_used    */ 0,
-                /* args         */ std::vector<std::string>(),
-                /* loaded_info  */ {},
-                /* exit_code    */ 0,
-                /* stop_timeout */ DEFAULT_STOP_TIMEOUT,
-                /* multimodal   */ mtmd_caps{false, false},
+                /* preset        */ preset,
+                /* name          */ name,
+                /* aliases       */ {},
+                /* tags          */ {},
+                /* port          */ 0,
+                /* status        */ SERVER_MODEL_STATUS_UNLOADED,
+                /* last_used     */ 0,
+                /* args          */ std::vector<std::string>(),
+                /* loaded_info   */ {},
+                /* exit_code     */ 0,
+                /* stop_timeout  */ DEFAULT_STOP_TIMEOUT,
+                /* multimodal    */ mtmd_caps{false, false},
+                /* need_download */ false,
             };
             add_model(std::move(meta));
         }
@@ -524,18 +526,19 @@ void server_models::load_models() {
         for (const auto & [name, preset] : final_presets) {
             if (mapping.find(name) == mapping.end()) {
                 server_model_meta meta{
-                    /* preset       */ preset,
-                    /* name         */ name,
-                    /* aliases      */ {},
-                    /* tags         */ {},
-                    /* port         */ 0,
-                    /* status       */ SERVER_MODEL_STATUS_UNLOADED,
-                    /* last_used    */ 0,
-                    /* args         */ std::vector<std::string>(),
-                    /* loaded_info  */ {},
-                    /* exit_code    */ 0,
-                    /* stop_timeout */ DEFAULT_STOP_TIMEOUT,
-                    /* multimodal   */ mtmd_caps{false, false},
+                    /* preset        */ preset,
+                    /* name          */ name,
+                    /* aliases       */ {},
+                    /* tags          */ {},
+                    /* port          */ 0,
+                    /* status        */ SERVER_MODEL_STATUS_UNLOADED,
+                    /* last_used     */ 0,
+                    /* args          */ std::vector<std::string>(),
+                    /* loaded_info   */ {},
+                    /* exit_code     */ 0,
+                    /* stop_timeout  */ DEFAULT_STOP_TIMEOUT,
+                    /* multimodal    */ mtmd_caps{false, false},
+                    /* need_download */ false,
                 };
                 add_model(std::move(meta));
                 newly_added.push_back(name);
@@ -1263,14 +1266,15 @@ void server_models_routes::init_routes() {
             };
 
             json model_info = json {
-                {"id",           meta.name},
-                {"aliases",      meta.aliases},
-                {"tags",         meta.tags},
-                {"object",       "model"},    // for OAI-compat
-                {"owned_by",     "llamacpp"}, // for OAI-compat
-                {"created",      t},          // for OAI-compat
-                {"status",       status},
-                {"architecture", architecture},
+                {"id",            meta.name},
+                {"aliases",       meta.aliases},
+                {"tags",          meta.tags},
+                {"object",        "model"},    // for OAI-compat
+                {"owned_by",      "llamacpp"}, // for OAI-compat
+                {"created",       t},          // for OAI-compat
+                {"status",        status},
+                {"architecture",  architecture},
+                {"need_download", meta.need_download},
                 // TODO: add other fields, may require reading GGUF metadata
             };
 
diff --git a/tools/server/server-models.h b/tools/server/server-models.h
index e96d76c9169..2198589a7aa 100644
--- a/tools/server/server-models.h
+++ b/tools/server/server-models.h
@@ -67,6 +67,7 @@ struct server_model_meta {
     int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
     int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown
     mtmd_caps multimodal; // multimodal capabilities
+    bool need_download = false; // whether the model needs to be downloaded before loading
 
     bool is_ready() const {
         return status == SERVER_MODEL_STATUS_LOADED;
diff --git a/tools/server/server-queue.cpp b/tools/server/server-queue.cpp
index d5fceb1b131..588e1a82b18 100644
--- a/tools/server/server-queue.cpp
+++ b/tools/server/server-queue.cpp
@@ -381,8 +381,10 @@ server_task_result_ptr server_response_reader::next(const std::function<bool()>
         if (result == nullptr) {
             // timeout, check stop condition
             if (should_stop()) {
-                SRV_WRN("%s", "stopping wait for next result due to should_stop condition (adjust the --timeout argument if needed)\n");
-                SRV_WRN("%s", "ref: https://github.com/ggml-org/llama.cpp/pull/22907\n");
+                const int64_t time_elapsed_ms = ggml_time_ms() - time_start_ms;
+                if (time_elapsed_ms > 30000) {
+                    SRV_WRN("%s", "request cancelled after 30s, potentially a client-side timeout; please check your client's code\n");
+                }
                 return nullptr;
             }
         } else {
diff --git a/tools/server/server-queue.h b/tools/server/server-queue.h
index 35f010401fc..8ce32c69fb0 100644
--- a/tools/server/server-queue.h
+++ b/tools/server/server-queue.h
@@ -169,6 +169,8 @@ struct server_response_reader {
     bool cancelled = false;
     int polling_interval_seconds;
 
+    const int64_t time_start_ms = ggml_time_ms();
+
     // tracking generation state and partial tool calls
     // only used by streaming completions
     std::vector<task_result_state> states;
diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp
index abc00c82bdb..ff80be6ccba 100644
--- a/tools/server/server-task.cpp
+++ b/tools/server/server-task.cpp
@@ -1422,6 +1422,9 @@ void server_task_result_cmpl_partial::update(task_result_state & state) {
 
 json server_task_result_cmpl_partial::to_json() {
     GGML_ASSERT(is_updated && "update() must be called before to_json()");
+    if (is_begin) {
+        return nullptr; // simply signal to HTTP handler to send the headers and status code
+    }
     switch (res_type) {
         case TASK_RESPONSE_TYPE_NONE:
             return to_json_non_oaicompat();
diff --git a/tools/server/server-task.h b/tools/server/server-task.h
index 60e216e7927..d47dc690cff 100644
--- a/tools/server/server-task.h
+++ b/tools/server/server-task.h
@@ -47,7 +47,7 @@ enum stop_type {
 };
 
 struct task_params {
-    bool stream          = true;
+    bool stream          = false;
     bool include_usage   = false;
     bool cache_prompt    = true; // remember the prompt to avoid reprocessing all prompt
     bool return_tokens   = false;
@@ -418,6 +418,8 @@ struct server_task_result_cmpl_partial : server_task_result {
 
     bool post_sampling_probs;
     bool is_progress = false;
+    bool is_begin = false; // whether to send 200 status to HTTP client (begin of SSE stream)
+                           // ref: https://github.com/ggml-org/llama.cpp/pull/23884
     completion_token_output prob_output;
     result_timings timings;
     result_prompt_progress progress;
diff --git a/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionModels.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionModels.svelte
index 07f079f5b51..fd866b243ea 100644
--- a/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionModels.svelte
+++ b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionModels.svelte
@@ -106,10 +106,14 @@
 	});
 
 	$effect(() => {
+		void modelPropsVersion;
+
 		hasAudioModality = activeModelId ? modelsStore.modelSupportsAudio(activeModelId) : false;
 	});
 
 	$effect(() => {
+		void modelPropsVersion;
+
 		hasVideoModality = activeModelId ? modelsStore.modelSupportsVideo(activeModelId) : false;
 	});
 
diff --git a/tools/ui/src/lib/enums/files.enums.ts b/tools/ui/src/lib/enums/files.enums.ts
index 2f583d52eae..8008a1040b2 100644
--- a/tools/ui/src/lib/enums/files.enums.ts
+++ b/tools/ui/src/lib/enums/files.enums.ts
@@ -186,6 +186,7 @@ export enum MimeTypeAudio {
 	WAVE = 'audio/wave',
 	X_WAV = 'audio/x-wav',
 	X_WAVE = 'audio/x-wave',
+	VND_WAVE = 'audio/vnd.wave',
 	X_PN_WAV = 'audio/x-pn-wav',
 	WEBM = 'audio/webm',
 	WEBM_OPUS = 'audio/webm;codecs=opus'
diff --git a/tools/ui/src/lib/services/chat.service.ts b/tools/ui/src/lib/services/chat.service.ts
index 3c9ca74796d..d6c7e36d70e 100644
--- a/tools/ui/src/lib/services/chat.service.ts
+++ b/tools/ui/src/lib/services/chat.service.ts
@@ -40,6 +40,7 @@ function getAudioInputFormat(mimeType: string): AudioInputFormat {
 		normalizedMimeType === MimeTypeAudio.WAVE ||
 		normalizedMimeType === MimeTypeAudio.X_WAV ||
 		normalizedMimeType === MimeTypeAudio.X_WAVE ||
+		normalizedMimeType === MimeTypeAudio.VND_WAVE ||
 		normalizedMimeType === MimeTypeAudio.X_PN_WAV
 	) {
 		return FileTypeAudio.WAV;
diff --git a/tools/ui/src/lib/utils/file-type.ts b/tools/ui/src/lib/utils/file-type.ts
index 7495163d15d..d14efbc3505 100644
--- a/tools/ui/src/lib/utils/file-type.ts
+++ b/tools/ui/src/lib/utils/file-type.ts
@@ -40,6 +40,7 @@ export function getFileTypeCategory(mimeType: string): FileTypeCategory | null {
 		case MimeTypeAudio.WAVE:
 		case MimeTypeAudio.X_WAV:
 		case MimeTypeAudio.X_WAVE:
+		case MimeTypeAudio.VND_WAVE:
 		case MimeTypeAudio.X_PN_WAV:
 		case MimeTypeAudio.WEBM:
 		case MimeTypeAudio.WEBM_OPUS: