diff --git a/.github/workflows/build-llamacpp-rocm.yml b/.github/workflows/build-llamacpp-rocm.yml
index f4793d9..0478b91 100644
--- a/.github/workflows/build-llamacpp-rocm.yml
+++ b/.github/workflows/build-llamacpp-rocm.yml
@@ -9,9 +9,9 @@ on:
       gfx_target:
         description: 'AMD GPU targets (comma-separated)'
         required: false
-        default: 'gfx1151,gfx120X,gfx110X'
+        default: 'gfx1151,gfx1150,gfx120X,gfx110X'
       rocm_version:
-        description: 'ROCm version to use (e.g., 7.10.0a20251022) or "latest" to auto-detect'
+        description: 'ROCm version to use (e.g., 7.11.0a20251205) or "latest" to auto-detect'
         required: false
         default: 'latest'
       llamacpp_version:
@@ -36,7 +36,7 @@ on:
 
 env:
   OPERATING_SYSTEMS: ${{ github.event.inputs.operating_systems || 'windows,ubuntu' }}
-  GFX_TARGETS: ${{ github.event.inputs.gfx_target || 'gfx1151,gfx120X,gfx110X' }}
+  GFX_TARGETS: ${{ github.event.inputs.gfx_target || 'gfx1151,gfx1150,gfx120X,gfx110X' }}
   ROCM_VERSION: ${{ github.event.inputs.rocm_version || 'latest' }}
   LLAMACPP_VERSION: ${{ github.event.inputs.llamacpp_version || 'latest' }}
 
@@ -241,31 +241,7 @@ jobs:
         
         # Extract the tarball
         tar -xzf rocm.tar.gz -C C:\opt\rocm --strip-components=1
-        
-    #- name: Install rocWMMA headers (develop)
-    #  run: |
-    #    $repoUrl = "https://github.com/ROCm/rocWMMA.git"
-    #    $cloneDir = Join-Path $env:TEMP "rocWMMA-develop"
-    #    $destIncludeDir = "C:\opt\rocm\include"
-    #    $destHeaders = Join-Path $destIncludeDir "rocwmma"
-    #    
-    #    if (Test-Path $cloneDir) { Remove-Item -Recurse -Force $cloneDir }
-    #    Write-Host "Cloning rocWMMA develop branch..."
-    #    git clone --depth 1 --single-branch --branch develop $repoUrl $cloneDir
-    #    
-    #    $srcHeaders = Join-Path $cloneDir "library\include\rocwmma"
-    #    if (!(Test-Path $srcHeaders)) {
-    #      Write-Error "rocWMMA headers not found at expected path: $srcHeaders"
-    #      exit 1
-    #    }
-    #    
-    #    Write-Host "Installing headers to $destHeaders ..."
-    #    New-Item -ItemType Directory -Force -Path $destIncludeDir | Out-Null
-    #    if (Test-Path $destHeaders) { Remove-Item -Recurse -Force $destHeaders }
-    #    Copy-Item -Recurse -Force $srcHeaders $destHeaders
-    #    
-    #    Write-Host "rocWMMA headers (develop) installed successfully."
-        
+      
     - name: Clone llama.cpp
       run: |
         $llamacppVersion = "${{ env.LLAMACPP_VERSION }}"
@@ -287,13 +263,6 @@ jobs:
         Write-Host "Current llama.cpp commit:"
         git log --oneline -1
 
-    # - name: Apply rocWMMA patch (non-interactive)
-    #   shell: bash
-    #   run: |
-    #     chmod +x utils/rocwmma_patch.sh
-    #     # Run non-interactively; keep backups (answer 'n')
-    #     printf "n\n" | ./utils/rocwmma_patch.sh ./llama.cpp
-
     - name: Build Llama.cpp + ROCm
       shell: cmd
       run: |
@@ -306,6 +275,8 @@ jobs:
           set "mapped_target=gfx1100;gfx1101;gfx1102"
         ) else if "%current_target%"=="gfx1151" (
           set "mapped_target=gfx1151"
+        ) else if "%current_target%"=="gfx1150" (
+          set "mapped_target=gfx1150"
         ) else if "%current_target%"=="gfx120X" (
           set "mapped_target=gfx1200;gfx1201"
         ) else (
@@ -332,15 +303,16 @@ jobs:
         cmake .. -G Ninja ^
           -DCMAKE_C_COMPILER="C:\opt\rocm\lib\llvm\bin\clang.exe" ^
           -DCMAKE_CXX_COMPILER="C:\opt\rocm\lib\llvm\bin\clang++.exe" ^
+          -DCMAKE_CXX_FLAGS="-IC:\opt\rocm\include" ^
           -DCMAKE_CROSSCOMPILING=ON ^
           -DCMAKE_BUILD_TYPE=Release ^
-          -DAMDGPU_TARGETS="%mapped_target%" ^
+          -DGPU_TARGETS="%mapped_target%" ^
           -DBUILD_SHARED_LIBS=ON ^
           -DLLAMA_BUILD_TESTS=OFF ^
           -DGGML_HIP=ON ^
           -DGGML_OPENMP=OFF ^
           -DGGML_CUDA_FORCE_CUBLAS=OFF ^
-          -DGGML_HIP_ROCWMMA_FATTN=OFF ^
+          -DGGML_HIP_ROCWMMA_FATTN=ON ^
           -DLLAMA_CURL=OFF ^
           -DGGML_NATIVE=OFF ^
           -DGGML_STATIC=OFF ^
@@ -687,24 +659,7 @@ jobs:
         echo "PKG_CONFIG_PATH=/opt/rocm/lib/pkgconfig:${PKG_CONFIG_PATH:-}" >> $GITHUB_ENV
         
         echo "ROCm environment variables set successfully"
-        
-    #- name: Install rocWMMA headers (develop)
-    #  run: |
-    #    set -e
-    #    TMP_DIR="$(mktemp -d)"
-    #    echo "Using temp directory: $TMP_DIR"
-    #    git clone --depth 1 --single-branch --branch develop https://github.com/ROCm/rocWMMA.git "$TMP_DIR/rocWMMA"
-    #    SRC_DIR="$TMP_DIR/rocWMMA/library/include/rocwmma"
-    #    if [ ! -d "$SRC_DIR" ]; then
-    #      echo "rocWMMA headers not found at expected path: $SRC_DIR"
-    #      exit 1
-    #    fi
-    #    echo "Installing headers to /opt/rocm/include/rocwmma ..."
-    #    sudo mkdir -p /opt/rocm/include
-    #    sudo rm -rf /opt/rocm/include/rocwmma
-    #    sudo cp -r "$SRC_DIR" /opt/rocm/include/
-    #    rm -rf "$TMP_DIR"
-    #    echo "rocWMMA headers (develop) installed successfully."
+
         
     - name: Clone llama.cpp
       run: |
@@ -727,11 +682,6 @@ jobs:
         echo "Current llama.cpp commit:"
         git log --oneline -1
 
-    #- name: Apply rocWMMA patch (non-interactive)
-    #  run: |
-    #    chmod +x utils/rocwmma_patch.sh
-    #    # Run non-interactively; keep backups (answer 'n')
-    #    printf "n\n" | ./utils/rocwmma_patch.sh ./llama.cpp
         
     - name: Build Llama.cpp + ROCm
       run: |
@@ -743,6 +693,8 @@ jobs:
           mapped_target="gfx1100;gfx1101;gfx1102"
         elif [ "$current_target" = "gfx1151" ]; then
           mapped_target="gfx1151"
+        elif [ "$current_target" = "gfx1150" ]; then
+          mapped_target="gfx1150"
         elif [ "$current_target" = "gfx120X" ]; then
           mapped_target="gfx1200;gfx1201"
         else
@@ -759,15 +711,16 @@ jobs:
         cmake .. -G Ninja \
           -DCMAKE_C_COMPILER=/opt/rocm/llvm/bin/clang \
           -DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ \
+          -DCMAKE_CXX_FLAGS="-I/opt/rocm/include" \
           -DCMAKE_CROSSCOMPILING=ON \
           -DCMAKE_BUILD_TYPE=Release \
-          -DAMDGPU_TARGETS="$mapped_target" \
+          -DGPU_TARGETS="$mapped_target" \
           -DBUILD_SHARED_LIBS=ON \
           -DLLAMA_BUILD_TESTS=OFF \
           -DGGML_HIP=ON \
           -DGGML_OPENMP=OFF \
           -DGGML_CUDA_FORCE_CUBLAS=OFF \
-          -DGGML_HIP_ROCWMMA_FATTN=OFF \
+          -DGGML_HIP_ROCWMMA_FATTN=ON \
           -DLLAMA_CURL=OFF \
           -DGGML_NATIVE=OFF \
           -DGGML_STATIC=OFF \
@@ -931,7 +884,7 @@ jobs:
     needs: [prepare-matrix, build-windows]
     if: |
       needs.build-windows.result == 'success' &&
-      contains(github.event.inputs.gfx_target || 'gfx1151,gfx120X,gfx110X', 'gfx1151') &&
+      contains(github.event.inputs.gfx_target || 'gfx1151,gfx1150,gfx120X,gfx110X', 'gfx1151') &&
       needs.prepare-matrix.outputs.should_build_windows == 'true'
 
     steps:
@@ -1053,7 +1006,7 @@ jobs:
     needs: [prepare-matrix, build-ubuntu]
     if: |
       needs.build-ubuntu.result == 'success' &&
-      contains(github.event.inputs.gfx_target || 'gfx1151,gfx120X,gfx110X', 'gfx1151') &&
+      contains(github.event.inputs.gfx_target || 'gfx1151,gfx1150,gfx120X,gfx110X', 'gfx1151') &&
       needs.prepare-matrix.outputs.should_build_ubuntu == 'true'
 
     steps:
diff --git a/README.md b/README.md
index 799da4b..8152388 100644
--- a/README.md
+++ b/README.md
@@ -31,7 +31,8 @@ We provide nightly builds of **llama.cpp** with **AMD ROCm™ 7** acceleration b
 ## 🎯 Supported Devices
 
 This build specifically targets the following GPU architectures:
-- **gfx1151** (STX Halo GPUs) - Ryzen AI MAX+ Pro 395
+- **gfx1151** (STX Halo APU) - Ryzen AI MAX+ Pro 395
+- **gfx1150** (STX Point APU) - Ryzen AI 300
 - **gfx120X** (RDNA4 GPUs) - includes AMD Radeon RX 9070 XT/GRE/9070, RX 9060 XT/9060
 - **gfx110X** (RDNA3 GPUs) - includes AMD Radeon PRO W7900/W7800/W7700/W7600, RX 7900 XTX/XT/GRE, RX 7800 XT, RX 7700 XT/7700, RX 7600 XT/7600
 
@@ -41,13 +42,14 @@ This build specifically targets the following GPU architectures:
 
 Our automated GitHub Actions workflow creates nightly builds for:
 - **Windows** and **Ubuntu** operating systems
-- **Multiple GPU targets**: `gfx1151`, `gfx120X`, `gfx110X`
+- **Multiple GPU targets**: `gfx1151`, `gfx1150`, `gfx110X`, `gfx120X`
 - **ROCm™ 7 built-in** - complete runtime libraries included
 
 
 | GPU Target | Ubuntu | Windows |
 |-------------|--------|---------|
 | **gfx110X** | [![Download Ubuntu gfx110X](https://img.shields.io/badge/Download-Ubuntu%20gfx110X-blue)](https://github.com/aigdat/llamacpp-rocm/releases/latest) | [![Download Windows gfx110X](https://img.shields.io/badge/Download-Windows%20gfx110X-green)](https://github.com/aigdat/llamacpp-rocm/releases/latest) |
+| **gfx1150** | [![Download Ubuntu gfx1150](https://img.shields.io/badge/Download-Ubuntu%20gfx1150-blue)](https://github.com/aigdat/llamacpp-rocm/releases/latest) | [![Download Windows gfx1150](https://img.shields.io/badge/Download-Windows%20gfx1150-green)](https://github.com/aigdat/llamacpp-rocm/releases/latest) |
 | **gfx1151** | [![Download Ubuntu gfx1151](https://img.shields.io/badge/Download-Ubuntu%20gfx1151-blue)](https://github.com/aigdat/llamacpp-rocm/releases/latest) | [![Download Windows gfx1151](https://img.shields.io/badge/Download-Windows%20gfx1151-green)](https://github.com/aigdat/llamacpp-rocm/releases/latest) |
 | **gfx120X** | [![Download Ubuntu gfx120X](https://img.shields.io/badge/Download-Ubuntu%20gfx120X-blue)](https://github.com/aigdat/llamacpp-rocm/releases/latest) | [![Download Windows gfx120X](https://img.shields.io/badge/Download-Windows%20gfx120X-green)](https://github.com/aigdat/llamacpp-rocm/releases/latest) |
 
diff --git a/docs/manual_instructions.md b/docs/manual_instructions.md
index 9b4c9fe..66d7a41 100644
--- a/docs/manual_instructions.md
+++ b/docs/manual_instructions.md
@@ -31,16 +31,13 @@ I used chocolatey, but you can also install those manually.
 
 ### Part 2: Organizing artifacts
 * Step 1: Get the latest run id from main [here](https://github.com/ROCm/TheRock/actions/workflows/release_windows_packages.yml).
-  * Example: [TheRock/actions/runs/16218534118/job/45793425858](https://github.com/ROCm/TheRock/actions/runs/16218534118/job/45793425858)
+  * Example: [TheRock/actions/runs/19952310972/job/57214748637](https://github.com/ROCm/TheRock/actions/runs/19952310972/job/57214748637)
 * Step 2: Look at the upload logs for your target GPU (e.g., `gfx1151`), and note the Windows URL:
-  ```
-  ://therock-nightly-tarball/therock-dist-windows-gfx1151-7.0.0rc20250711.tar.gz
-  ```
-* Step 4: Download the nightly tarball 
-  * Example: `therock-nightly-tarball.s3.amazonaws.com/YOUR_FILE`
-* Step 5: Extract the contents of this tar.gz file to `C:\opt\rocm`
-* Setp 6: Add `C:\opt\rocm\lib\llvm\bin` to path
-* Step 7: clone llamacpp
+* Step 3: Download the nightly tarball for Windows
+  * Example: [therock-nightly-tarball.s3.amazonaws.com/YOUR_WINDOWS_FILE](https://therock-nightly-tarball.s3.amazonaws.com/therock-dist-windows-gfx1151-7.11.0a20251205.tar.gz)
+* Step 4: Extract the contents of this tar.gz file to `C:\opt\rocm`
+* Step 5: Add `C:\opt\rocm\lib\llvm\bin` to path
+* Step 6: clone llamacpp
 
 ### Part 3: Building Llama.cpp + ROCm
 
@@ -53,11 +50,11 @@ set HIP_PLATFORM=amd
 cd "C:\<YOUR_LLAMACPP_PATH>\llama.cpp"
 mkdir build
 cd build
-cmake .. -G Ninja -DCMAKE_C_COMPILER="C:\opt\rocm\lib\llvm\bin\clang.exe" -DCMAKE_CXX_COMPILER="C:\opt\rocm\lib\llvm\bin\clang++.exe" -DCMAKE_CROSSCOMPILING=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS="gfx1151" -DBUILD_SHARED_LIBS=ON -DLLAMA_BUILD_TESTS=OFF -DGGML_HIP=ON -DGGML_OPENMP=OFF -DGGML_CUDA_FORCE_CUBLAS=OFF -DGGML_HIP_ROCWMMA_FATTN=ON -DGGML_HIP_FORCE_ROCWMMA_FATTN_GFX12=OFF -DLLAMA_CURL=OFF -DGGML_NATIVE=OFF -DGGML_STATIC=OFF -DCMAKE_SYSTEM_NAME=Windows
+cmake .. -G Ninja -DCMAKE_C_COMPILER="C:\opt\rocm\lib\llvm\bin\clang.exe" -DCMAKE_CXX_COMPILER="C:\opt\rocm\lib\llvm\bin\clang++.exe" -DCMAKE_CXX_FLAGS="-IC:\opt\rocm\include" -DCMAKE_CROSSCOMPILING=ON -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS="gfx1151" -DBUILD_SHARED_LIBS=ON -DLLAMA_BUILD_TESTS=OFF -DGGML_HIP=ON -DGGML_OPENMP=OFF -DGGML_CUDA_FORCE_CUBLAS=OFF -DGGML_HIP_ROCWMMA_FATTN=ON -DLLAMA_CURL=OFF -DGGML_NATIVE=OFF -DGGML_STATIC=OFF -DCMAKE_SYSTEM_NAME=Windows
 cmake --build . -j 24 2>&1 | findstr /i "error"
 ```
 
-> **Note**: Adjust the `-DAMDGPU_TARGETS="gfx1151"` parameter for your specific GPU. See the [GPU Target Reference](#gpu-target-reference) section for details.
+> **Note**: Adjust the `-DGPU_TARGETS="gfx1151"` parameter for your specific GPU. See the [GPU Target Reference](#gpu-target-reference) section for details.
 
 If you see no errors, that means that llama.cpp has correctly been built and files are available inside your `build\bin` folder. 
 
@@ -82,14 +79,14 @@ sudo apt install -y cmake ninja-build git wget
 * Step 1: Get the latest run id from main [here](https://github.com/ROCm/TheRock/actions/workflows/release_windows_packages.yml) (same as Windows).
 * Step 2: Look at the upload logs for your target GPU (e.g., `gfx1151`), but note the Linux URLs:
   ```
-  ://therock-nightly-tarball/therock-dist-linux-gfx1151-7.0.0rc20250711.tar.gz
+  ://therock-nightly-tarball/therock-dist-linux-gfx1151-7.11.0a20251205.tar.gz
   ```
 * Step 3: Download the nightly tarball for Linux
-  * Example: [therock-nightly-tarball.s3.amazonaws.com/YOUR_LINUX_FILE](https://therock-nightly-tarball.s3.amazonaws.com/therock-dist-linux-gfx1151-7.0.0rc20250711.tar.gz)
+  * Example: [therock-nightly-tarball.s3.amazonaws.com/YOUR_LINUX_FILE](https://therock-nightly-tarball.s3.amazonaws.com/therock-dist-linux-gfx1151-7.11.0a20251205.tar.gz)
 * Step 4: Extract the contents of this tar.gz file to `/opt/rocm`:
   ```bash
   sudo mkdir -p /opt/rocm
-  sudo tar -xzf therock-dist-linux-gfx1151-7.0.0rc20250711.tar.gz -C /opt/rocm --strip-components=1
+  sudo tar -xzf therock-dist-linux-gfx1151-7.11.0a20251205.tar.gz -C /opt/rocm --strip-components=1
   ```
 * Step 5: Set up ROCm environment variables:
   ```bash
@@ -127,9 +124,10 @@ cd build
 cmake .. -G Ninja \
   -DCMAKE_C_COMPILER=/opt/rocm/llvm/bin/clang \
   -DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ \
+  -DCMAKE_CXX_FLAGS="-I/opt/rocm/include" \
   -DCMAKE_CROSSCOMPILING=ON \
   -DCMAKE_BUILD_TYPE=Release \
-  -DAMDGPU_TARGETS="gfx1151" \
+  -DGPU_TARGETS="gfx1151" \
   -DBUILD_SHARED_LIBS=ON \
   -DLLAMA_BUILD_TESTS=OFF \
   -DGGML_HIP=ON \
@@ -145,7 +143,7 @@ cmake .. -G Ninja \
 cmake --build . -j $(nproc)
 ```
 
-> **Note**: Adjust the `-DAMDGPU_TARGETS="gfx1151"` parameter for your specific GPU. See the [GPU Target Reference](#gpu-target-reference) section for details.
+> **Note**: Adjust the `-DGPU_TARGETS="gfx1151"` parameter for your specific GPU. See the [GPU Target Reference](#gpu-target-reference) section for details.
 
 ### Part 4: Copy required ROCm libraries
 
@@ -179,25 +177,29 @@ If you see no errors during the build process, llama.cpp has been successfully c
 
 ## 🎯 GPU Target Reference
 
-When building llama.cpp with ROCm, the `-DAMDGPU_TARGETS` parameter must be set based on your specific GPU architecture. Our automated workflow uses generic targets that get mapped to specific architectures:
+When building llama.cpp with ROCm, the `-DGPU_TARGETS` parameter must be set based on your specific GPU architecture. Our automated workflow uses generic targets that get mapped to specific architectures:
 
 - **`gfx120X`** maps to `gfx1200, gfx1201` (RDNA4 series like: RX 9070 XT/GRE/9070, RX 9060 XT/9060)
 - **`gfx110X`** maps to `gfx1100, gfx1101, gfx1102` (RDNA3 series like: PRO W7900/W7800/W7700/W7600, RX 7900 XTX/XT/GRE, RX 7800 XT, RX 7700 XT/7700, RX 7600 XT/7600)  
+- **`gfx1150`** remains as `gfx1150` (Strix Point)
 - **`gfx1151`** remains as `gfx1151` (Strix Halo)
 
 For a complete list of GPU targets and their mappings, see the [automated workflow](../.github/workflows/build-llamacpp-rocm.yml).
 
 ### How to Use
 
-Replace the `-DAMDGPU_TARGETS="gfx1151"` parameter in your cmake command with the appropriate target for your GPU:
+Replace the `-DGPU_TARGETS="gfx1151"` parameter in your cmake command with the appropriate target for your GPU:
 
 ```bash
 # For RDNA4 series (RX 9070 XT/GRE/9070, RX 9060 XT/9060)
--DAMDGPU_TARGETS="gfx1200, gfx1201"
+-DGPU_TARGETS="gfx1200;gfx1201"
 
 # For RDNA3 series (PRO W7900/W7800/W7700/W7600, RX 7900 XTX/XT/GRE, RX 7800 XT, RX 7700 XT/7700, RX 7600 XT/7600) 
--DAMDGPU_TARGETS="gfx1100"
+-DGPU_TARGETS="gfx1100;gfx1101;gfx1102"
+
+# For Strix Point
+-DGPU_TARGETS="gfx1150"
 
 # For Strix Halo
--DAMDGPU_TARGETS="gfx1151"
+-DGPU_TARGETS="gfx1151"
 ```
diff --git a/utils/rocwmma_patch.sh b/utils/rocwmma_patch.sh
deleted file mode 100644
index 2cd582a..0000000
--- a/utils/rocwmma_patch.sh
+++ /dev/null
@@ -1,165 +0,0 @@
-#!/bin/bash
-
-# rocwmma_patch.sh - Apply rocWMMA compatibility fixes to llama.cpp
-# Usage: ./rocwmma_patch.sh <path-to-llama.cpp-directory>
-
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-LLAMA_DIR="${1:-}"
-
-if [[ -z "$LLAMA_DIR" ]]; then
-    echo "Usage: $0 <path-to-llama.cpp-directory>"
-    echo ""
-    echo "This script applies rocWMMA compatibility fixes to a llama.cpp checkout."
-    echo "The fixes resolve warp synchronization mask type conflicts between"
-    echo "ROCm headers and CUDA-style code when building with GGML_HIP_ROCWMMA_FATTN=ON."
-    echo ""
-    echo "Example:"
-    echo "  $0 ./llama.cpp"
-    echo "  $0 /path/to/your/llama.cpp"
-    exit 1
-fi
-
-if [[ ! -d "$LLAMA_DIR" ]]; then
-    echo "Error: Directory '$LLAMA_DIR' does not exist"
-    exit 1
-fi
-
-if [[ ! -f "$LLAMA_DIR/CMakeLists.txt" ]] || ! grep -q "llama" "$LLAMA_DIR/CMakeLists.txt" 2>/dev/null; then
-    echo "Error: '$LLAMA_DIR' does not appear to be a llama.cpp directory"
-    echo "Expected to find CMakeLists.txt with 'llama' references"
-    exit 1
-fi
-
-VENDOR_HIP_FILE="$LLAMA_DIR/ggml/src/ggml-cuda/vendors/hip.h"
-
-if [[ ! -f "$VENDOR_HIP_FILE" ]]; then
-    echo "Error: HIP vendor header not found at: $VENDOR_HIP_FILE"
-    echo "This script requires a llama.cpp version with HIP support"
-    exit 1
-fi
-
-echo "Applying rocWMMA compatibility fixes to: $LLAMA_DIR"
-echo ""
-
-# Check if fixes are already applied
-if grep -q "GGML_HIP_WARP_MASK" "$VENDOR_HIP_FILE" 2>/dev/null; then
-    echo "rocWMMA fixes appear to already be applied (found GGML_HIP_WARP_MASK)"
-    echo "To reapply, please first revert changes and run this script again"
-    exit 0
-fi
-
-echo "Step 1: Modifying HIP vendor header..."
-
-# Backup the original file
-cp "$VENDOR_HIP_FILE" "$VENDOR_HIP_FILE.backup"
-
-# Find the line with __shfl_sync and __shfl_xor_sync definitions
-SHFL_LINE=$(grep -n "^#define __shfl_sync" "$VENDOR_HIP_FILE" | head -1 | cut -d: -f1)
-
-if [[ -z "$SHFL_LINE" ]]; then
-    echo "Error: Could not find __shfl_sync macro definition in $VENDOR_HIP_FILE"
-    echo "This script may need updates for this version of llama.cpp"
-    exit 1
-fi
-
-# Create a temporary file with the fix
-{
-    # Print lines before the __shfl_sync definition
-    head -n $((SHFL_LINE - 1)) "$VENDOR_HIP_FILE"
-    
-    # Add our conditional compilation block
-    cat << 'EOF'
-#ifdef GGML_HIP_ROCWMMA_FATTN
-// ROCm requires 64-bit masks for __shfl_*_sync functions
-#define GGML_HIP_WARP_MASK 0xFFFFFFFFFFFFFFFFULL
-#else
-#define __shfl_sync(mask, var, laneMask, width) __shfl(var, laneMask, width)
-#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
-#define GGML_HIP_WARP_MASK 0xFFFFFFFF
-#endif
-EOF
-    
-    # Skip the original __shfl_sync and __shfl_xor_sync lines and print the rest
-    tail -n +$((SHFL_LINE + 2)) "$VENDOR_HIP_FILE"
-    
-} > "$VENDOR_HIP_FILE.tmp"
-
-mv "$VENDOR_HIP_FILE.tmp" "$VENDOR_HIP_FILE"
-
-echo "  ✓ Added conditional GGML_HIP_WARP_MASK macro to vendor header"
-
-echo ""
-echo "Step 2: Replacing hardcoded warp masks in CUDA files..."
-
-# Find all .cu and .cuh files in the ggml/src/ggml-cuda directory
-CUDA_FILES=($(find "$LLAMA_DIR/ggml/src/ggml-cuda" -name "*.cu" -o -name "*.cuh" 2>/dev/null | sort))
-
-if [[ ${#CUDA_FILES[@]} -eq 0 ]]; then
-    echo "Warning: No CUDA files found in $LLAMA_DIR/ggml/src/ggml-cuda"
-    echo "This may be expected for some llama.cpp versions"
-else
-    MODIFIED_COUNT=0
-    
-    for file in "${CUDA_FILES[@]}"; do
-        # Check if file contains the hardcoded masks
-        if grep -q "0xFFFFFFFF\|0xffffffff" "$file" 2>/dev/null; then
-            # Create backup
-            cp "$file" "$file.backup"
-            
-            # Replace both uppercase and lowercase versions
-            sed -i 's/0xFFFFFFFF/GGML_HIP_WARP_MASK/g; s/0xffffffff/GGML_HIP_WARP_MASK/g' "$file"
-            
-            MODIFIED_COUNT=$((MODIFIED_COUNT + 1))
-            echo "  ✓ Modified: $(basename "$file")"
-        fi
-    done
-    
-    echo "  ✓ Modified $MODIFIED_COUNT CUDA files"
-fi
-
-echo ""
-echo "Step 3: Verification..."
-
-# Verify the vendor header was modified correctly
-if grep -q "GGML_HIP_ROCWMMA_FATTN" "$VENDOR_HIP_FILE" && grep -q "GGML_HIP_WARP_MASK" "$VENDOR_HIP_FILE"; then
-    echo "  ✓ Vendor header modification verified"
-else
-    echo "  ✗ Vendor header modification failed"
-    # Restore backup
-    mv "$VENDOR_HIP_FILE.backup" "$VENDOR_HIP_FILE"
-    echo "  ✓ Restored original vendor header"
-    exit 1
-fi
-
-echo ""
-echo "🎉 rocWMMA compatibility fixes applied successfully!"
-echo ""
-echo "What was changed:"
-echo "  • Added conditional GGML_HIP_WARP_MASK macro to ggml/src/ggml-cuda/vendors/hip.h"
-echo "  • Replaced hardcoded 0xFFFFFFFF/0xffffffff with GGML_HIP_WARP_MASK in CUDA files"
-echo ""
-echo "Behavior:"
-echo "  • For regular HIP builds: GGML_HIP_WARP_MASK = 0xFFFFFFFF (no change)"
-echo "  • For rocWMMA builds: GGML_HIP_WARP_MASK = 0xFFFFFFFFFFFFFFFFULL (64-bit masks)"
-echo ""
-echo "To build with rocWMMA support, use:"
-echo "  cmake -B build -S '$LLAMA_DIR' -DGGML_HIP=ON -DAMDGPU_TARGETS=\"gfx1151\" -DGGML_HIP_ROCWMMA_FATTN=ON"
-echo ""
-echo "Backup files were created with .backup extension in case you need to revert."
-
-# Clean up backup files from CUDA directory on success
-echo ""
-read -p "Remove backup files? (y/N): " -n 1 -r
-echo
-if [[ $REPLY =~ ^[Yy]$ ]]; then
-    find "$LLAMA_DIR/ggml/src/ggml-cuda" -name "*.backup" -delete 2>/dev/null || true
-    rm -f "$VENDOR_HIP_FILE.backup"
-    echo "  ✓ Backup files removed"
-else
-    echo "  ℹ Backup files kept for safety"
-fi
-
-echo ""
-echo "Done! Your llama.cpp checkout now supports rocWMMA builds."
\ No newline at end of file